import java.net.*;
import java.io.*;
/**
 * Downloads an HTML page and saves it to the current directory, rewriting
 * relative links in A, IMG and APPLET tags to absolute URLs so that the
 * saved copy still points at the original site.
 *
 * Usage: java PageSaver url1 [url2 ...]
 */
public class PageSaver {

    // The page being saved; also the base against which relative links are resolved.
    URL theURL;

    /**
     * Saves every URL given on the command line.
     *
     * @param args the URLs of the pages to save
     */
    public static void main (String args[]) {
        // Loop through the command line arguments
        for (int i = 0; i < args.length; i++) {
            try {
                // Use args[i], not args[0]: each argument is its own page.
                URL root = new URL(args[i]);
                PageSaver ps = new PageSaver(root);
                ps.saveThePage();
            }
            catch (MalformedURLException e) {
                System.err.println(args[i] + " is not a parseable URL");
                System.err.println(e);
            }
        } // end for
    } // end main

    /**
     * Creates a saver for a single page.
     *
     * @param u the URL of the page to save
     */
    public PageSaver(URL u) {
        theURL = u;
    }

    /**
     * Opens a stream from the URL, opens a PrintStream onto a local file,
     * and copies one to the other, passing every tag through convertTag.
     * Failures are reported on System.err rather than thrown, so that one
     * bad page does not abort the remaining command line arguments.
     */
    public void saveThePage() {
        DataInputStream theHTML = null;
        PrintStream p = null;
        try {
            // Buffer the network stream: the page is consumed one byte at a time.
            theHTML = new DataInputStream(new BufferedInputStream(theURL.openStream()));
            p = makeOutputFile();
            while (true) {
                // readUnsignedByte keeps bytes >= 0x80 in the range 0..255;
                // a plain (char) readByte() would sign-extend them to 0xFF80..0xFFFF.
                char thisChar = (char) theHTML.readUnsignedByte();
                if (thisChar == '<') {
                    p.print(convertTag(readTag(theHTML)));
                }
                else {
                    p.print(thisChar);
                }
            } // end while
        } // end try
        catch (EOFException e) {
            // readUnsignedByte signals end of stream this way: the page is done
        }
        catch (Exception e) {
            // Per-page boundary: report and let the caller continue
            System.err.println(e);
        }
        finally {
            // Either stream may still be null if opening it failed
            if (p != null) {
                // PrintStream never throws on write errors; it only records them
                boolean failed = p.checkError();
                p.close();
                if (failed) {
                    System.err.println("Error while writing the saved copy of " + theURL);
                }
            }
            if (theHTML != null) {
                try {
                    theHTML.close();
                }
                catch (IOException e) {
                    System.err.println(e);
                }
            }
        }
    } // end saveThePage

    /**
     * Opens a file in the current directory named after the last path
     * segment of the URL ("index.html" when there is none) and chains a
     * PrintStream to it.
     *
     * Only the last path segment is used, without the query string, so a
     * URL such as /a/b/page.html?x=1 is saved as page.html and a path
     * containing ".." cannot write outside the current directory.
     *
     * The stream encodes as ISO-8859-1 so that characters 0..255, which is
     * how saveThePage and readTag represent the raw input bytes, are written
     * back as exactly the same bytes.
     *
     * @return a PrintStream writing to the local file
     * @throws IOException if the file cannot be opened for writing
     */
    public PrintStream makeOutputFile() throws IOException {
        // getPath, unlike getFile, excludes the query string
        String path = theURL.getPath();
        if (path == null) path = "";
        // Keep only what follows the last separator; this also drops the
        // leading slash. When neither separator occurs the index is -1 and
        // the whole path is kept.
        int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        String theFile = path.substring(lastSeparator + 1);
        System.err.println("\n\n\n" + theFile + "\n\n\n");
        if (theFile.equals("") || theFile.equals(".") || theFile.equals("..")) {
            theFile = "index.html";
        }
        // At this point you should check to see whether
        // the file already exists and, if it does,
        // ask the user if they wish to overwrite it
        FileOutputStream fout = new FileOutputStream(theFile);
        // Buffered because the page is written one character at a time
        return new PrintStream(new BufferedOutputStream(fout), false, "ISO-8859-1");
    }

    /**
     * Reads the remainder of a tag. Called when a &lt; has been read from the
     * input stream but not yet sent to the output stream.
     *
     * This method has trouble (as do most web browsers) if it encounters a
     * raw &lt; sign in the stream. Technically raw &lt; signs should be
     * encoded as &amp;lt; in the original HTML.
     *
     * @param is the stream positioned just after the opening &lt;
     * @return the tag including the leading &lt; and, unless the stream ended
     *         or failed first, the closing &gt;; each input byte becomes the
     *         character with the same value (0..255)
     */
    public static String readTag(DataInputStream is) {
        StringBuffer theTag = new StringBuffer("<");
        try {
            int theChar;
            do {
                theChar = is.readUnsignedByte();
                theTag.append((char) theChar);
            } while (theChar != '>');
        } // end try
        catch (EOFException e) {
            // Stream ended inside the tag; return what was read so far
        }
        catch (IOException e) {
            System.err.println(e);
        }
        return theTag.toString();
    }

    /**
     * Takes a complete tag and, if it is an A, IMG or APPLET tag whose
     * HREF, SRC or CODEBASE attribute holds a relative URL, returns the tag
     * with that URL made absolute against the page URL. Everything else in
     * the tag, including its case, is left exactly as it was. Any tag that
     * is not link based, has no parseable URL, or already has an absolute
     * URL is returned unchanged.
     *
     * @param tag the complete tag, from &lt; to &gt;
     * @return the converted tag, or the original tag if nothing was changed
     */
    public String convertTag(String tag) {
        if (tag == null) return null;
        String attribute = linkAttributeFor(tag);
        if (attribute == null) return tag; // this is not a link based tag

        // find the = that follows the attribute name
        int equalsAt = indexOfAttributeEquals(tag, attribute);
        if (equalsAt == -1) return tag;

        // Ideally the = sign is immediately followed by a quote mark, the
        // URL, and a matching quote mark. Since a lot of HTML is
        // non-conforming, also allow whitespace after the = and an
        // unquoted URL.
        int length = tag.length();
        int start = equalsAt + 1;
        while (start < length && Character.isWhitespace(tag.charAt(start))) {
            start++;
        }
        if (start >= length) return tag;

        int end;
        char first = tag.charAt(start);
        if (first == '"' || first == '\'') {
            // Quoted: the URL runs to the matching quote
            start++;
            end = tag.indexOf(first, start);
            if (end == -1) return tag; // unterminated quote; leave the tag alone
        }
        else {
            // Unquoted: the URL runs to whitespace or the end of the tag
            end = start;
            while (end < length
                   && !Character.isWhitespace(tag.charAt(end))
                   && tag.charAt(end) != '>') {
                end++;
            }
        }

        // URLs are in general NOT case insensitive, so the URL is taken
        // from the tag exactly as written
        String link = tag.substring(start, end);

        // Is it a relative URL? Relative URLs don't contain colons.
        if (link.length() == 0 || link.indexOf(':') != -1) return tag;

        try {
            // build an absolute URL from the relative URL
            URL newURL = new URL(theURL, link);
            // replace the old URL with the new URL, keeping the rest of the
            // original tag untouched
            return tag.substring(0, start) + newURL + tag.substring(end);
        }
        catch (MalformedURLException e) {
            System.err.println(e);
            return tag;
        }
    }

    // Returns the name of the URL-bearing attribute for a link based tag, or null.
    private static String linkAttributeFor(String tag) {
        if (isElement(tag, "A")) return "HREF";
        if (isElement(tag, "IMG")) return "SRC";
        if (isElement(tag, "APPLET")) return "CODEBASE";
        return null;
    }

    // True if tag opens the named element: <, the name in any case, then whitespace.
    private static boolean isElement(String tag, String name) {
        int afterName = 1 + name.length();
        return tag.length() > afterName
            && tag.charAt(0) == '<'
            // regionMatches with ignoreCase compares per character and does
            // not depend on the default locale, unlike toUpperCase()
            && tag.regionMatches(true, 1, name, 0, name.length())
            && Character.isWhitespace(tag.charAt(afterName));
    }

    // Returns the index of the = belonging to the named attribute, or -1. The
    // name must be preceded by whitespace and followed (after optional
    // whitespace) by =, so SRC does not match inside DATA-SRC or SRCSET.
    // Quoted values are not skipped, so text inside an earlier attribute
    // value that looks like name= can still match.
    private static int indexOfAttributeEquals(String tag, String name) {
        int length = tag.length();
        int lastStart = length - name.length();
        for (int i = 1; i <= lastStart; i++) {
            if (!Character.isWhitespace(tag.charAt(i - 1))) continue;
            if (!tag.regionMatches(true, i, name, 0, name.length())) continue;
            int j = i + name.length();
            while (j < length && Character.isWhitespace(tag.charAt(j))) {
                j++;
            }
            if (j < length && tag.charAt(j) == '=') return j;
        }
        return -1;
    }
}
|