Methods Summary |
---|
public java.lang.String | convertTag(java.lang.String tag)
// temporary position variables
int p1, p2, p3, p4;
try {
// HTML tags are cases insensitive so converting
// it to upper case makes the problem slightly easier
String s1 = tag.toUpperCase();
// Find the beginning and the end of the URL
//
if (s1.startsWith("<A HREF")) {
p1 = s1.indexOf("HREF");
}
else if (s1.startsWith("<IMG ")) {
p1 = s1.indexOf("SRC");
}
else if (s1.startsWith("<APPLET ")) {
p1 = s1.indexOf("CODEBASE");
}
else { // this is not a link based tag
return tag;
}
// find the =
p2 = s1.indexOf ("=", p1);
if (p2 == -1) return tag;
// Ideally the = sign is immediately followed by
// a " mark followed by the URL which is closed by a ".
// However since a lot of HTML is non-conforming we
// need to be a little sneakier. In this case we read
// characters in the URL until an character which is not
// whitespace is encountered.
p3 = p2+1;
while (Character.isSpace(s1.charAt(p3))) {
p3++;
}
if (s1.charAt(p3) == '"") p3++;
// p3 now points to the beginning of the URL
// The URL is read until a closing " or whitespace is seen
p4 = p3+1;
while (!Character.isSpace(s1.charAt(p4)) &&
s1.charAt(p4) != '"") {
p4++;
}
// The URL is the text between p3 and p4
// URL's are in general NOT case insensitive so the URL
// must be read from the original tag and not from s1
// which was uppercased
String link = tag.substring(p3, p4);
// Is it a relative URL? Relative URLs
// don't contain colons.
if (link.indexOf(":") == -1) {
// build an absolute URL from the relative URL
URL newURL = new URL(theURL, link);
// replace the old URL with the new URL
tag = s1.substring(0,p3) + newURL + s1.substring(p4,s1.length());
} // end if
} // end try
catch (StringIndexOutOfBoundsException e) {
// Most of the time a StringIndexOutOfBoundsException here means
// the tag was not standard conforming so
// the algorithm for finding the URL crapped out.
// If that's the case, the original tag is returned.
}
catch (Exception e) {
System.err.println(e);
}
return tag;
|
public static void | main(java.lang.String[] args)
// Loop through the command line arguments
for (int i = 0; i < args.length; i++) {
//Open the URL for reading
try {
URL root = new URL(args[0]);
PageSaver ps = new PageSaver(root);
ps.saveThePage();
}
catch (MalformedURLException e) {
System.err.println(args[0] + " is not a parseable URL");
System.err.println(e);
}
} // end for
|
public java.io.PrintStream | makeOutputFile()
FileOutputStream fout;
String theFile = theURL.getFile();
// the getFile method returns the filename prefixed with a slash,
// e.g. /index.html instead of index.html. That slash needs to be removed.
theFile = theFile.substring(1);
System.err.println("\n\n\n" + theFile + "\n\n\n");
if (theFile.equals("")) theFile = "index.html";
// At this point you should check to see whether
// the file already exists and, if it does,
// ask the user if they wish to overwrite it
fout = new FileOutputStream(theFile);
return new PrintStream(fout);
|
public static java.lang.String | readTag(java.io.DataInputStream is)
StringBuffer theTag = new StringBuffer("<");
char theChar = '<";
try {
while (theChar != '>") {
theChar = (char) is.readByte();
theTag.append(theChar);
} // end while
} // end try
catch (EOFException e) {
// Done with the Stream
}
catch (Exception e) {
System.err.println(e);
}
return theTag.toString();
|
public void | saveThePage()
char thisChar;
String theTag;
PrintStream p = null;
try {
DataInputStream theHTML = new DataInputStream(theURL.openStream());
p = makeOutputFile();
while (true) {
thisChar = (char) theHTML.readByte();
if (thisChar == '<") {
theTag = readTag(theHTML);
theTag = convertTag(theTag);
p.print(theTag);
}
else {
p.print(thisChar);
}
} // end while
} // end try
catch (EOFException e) { // This page is done
}
catch (Exception e) {
System.err.println(e);
}
finally {
p.close();
}
|