File Doc Category Size Date Package
PageSaver.java API Doc Example 5876 Thu Apr 03 15:15:26 BST 1997 None

PageSaver

java.lang.Object

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Downloads the page at a URL into a local file, turning the relative links
 * found in {@code <A HREF>}, {@code <IMG>} and {@code <APPLET>} tags into
 * absolute ones so the saved copy still points at the original resources.
 *
 * <p>The page is treated as a stream of single-byte characters: bytes outside
 * the tags are copied through unchanged.
 */
public class PageSaver extends Object {

    // ---- Fields ----

    /** The page to save; also the base against which relative links are resolved. */
    URL theURL;

    // ---- Constructors ----

    /**
     * Creates a saver for one page.
     *
     * @param u the URL of the page to save
     */
    public PageSaver(URL u) {
        theURL = u;
    }

    // ---- Methods ----

    /**
     * Rewrites the link inside a single HTML tag as an absolute URL.
     *
     * <p>Only tags starting with {@code <A HREF}, {@code <IMG } or
     * {@code <APPLET } (in any letter case) are considered; the attribute
     * examined is HREF, SRC or CODEBASE respectively. A link that already
     * contains a colon is taken to be absolute and left alone. Everything in
     * the tag other than the link itself is returned exactly as it was given.
     *
     * @param tag the complete tag text, from {@code <} through {@code >}
     * @return the tag with its relative link made absolute, or {@code tag}
     *         itself when it carries no link, is already absolute, or is too
     *         malformed to parse
     */
    public java.lang.String convertTag(java.lang.String tag) {
        if (tag == null) {
            return tag;
        }
        try {
            // HTML tag and attribute names are case insensitive. They are
            // matched in place rather than on an upper-cased copy, because
            // upper-casing can change the string's length and would then
            // invalidate every index computed below.
            String attribute;
            if (startsWithIgnoreCase(tag, "<A HREF")) {
                attribute = "HREF";
            } else if (startsWithIgnoreCase(tag, "<IMG ")) {
                attribute = "SRC";
            } else if (startsWithIgnoreCase(tag, "<APPLET ")) {
                attribute = "CODEBASE";
            } else {
                // this is not a link based tag
                return tag;
            }

            int nameStart = indexOfIgnoreCase(tag, attribute);
            if (nameStart == -1) {
                // e.g. <IMG ALT="x"> with no SRC: nothing to rewrite. Without
                // this guard the search below would latch onto the '=' of
                // some unrelated attribute.
                return tag;
            }

            int equalsAt = tag.indexOf('=', nameStart);
            if (equalsAt == -1) {
                return tag;
            }

            // Ideally the = sign is immediately followed by a quote, the URL,
            // and a closing quote. Since a lot of HTML is non-conforming,
            // skip any whitespace first and treat the quotes as optional.
            int urlStart = equalsAt + 1;
            while (Character.isWhitespace(tag.charAt(urlStart))) {
                urlStart++;
            }
            char quote = 0;
            char first = tag.charAt(urlStart);
            if (first == '"' || first == '\'') {
                quote = first;
                urlStart++;
            }

            // Start scanning AT urlStart (not one past it) so that an empty
            // value such as href="" yields an empty link instead of picking
            // up the closing quote.
            int urlEnd = urlStart;
            while (!endsUrl(tag.charAt(urlEnd), quote)) {
                urlEnd++;
            }

            String link = tag.substring(urlStart, urlEnd);

            // Is it a relative URL? Relative URLs don't contain colons.
            if (link.indexOf(':') == -1) {
                URL absolute = new URL(theURL, link);
                // Splice into the ORIGINAL text so the case of the other
                // attributes (ALT text, names, ...) is preserved.
                tag = tag.substring(0, urlStart) + absolute + tag.substring(urlEnd);
            }
        } catch (StringIndexOutOfBoundsException e) {
            // The scan ran off the end of the tag, which means the tag was not
            // well formed enough to locate the URL. Return it unchanged.
        } catch (MalformedURLException e) {
            System.err.println(e);
        }
        return tag;
    }

    /**
     * Saves every URL named on the command line.
     *
     * @param args the URLs of the pages to save, one per argument
     */
    public static void main(java.lang.String[] args) {
        for (int i = 0; i < args.length; i++) {
            try {
                URL root = new URL(args[i]);
                PageSaver ps = new PageSaver(root);
                ps.saveThePage();
            } catch (MalformedURLException e) {
                System.err.println(args[i] + " is not a parseable URL");
                System.err.println(e);
            }
        }
    }

    /**
     * Opens the local file the page will be written to. The name is the file
     * part of the URL without its leading slash; a URL with no file part, or
     * one that names a directory (ends in {@code /}), is saved as
     * {@code index.html}.
     *
     * <p>An existing file of the same name is overwritten without asking, and
     * the name is used as given, so any directories in it must already exist.
     *
     * @return a stream writing to the chosen file
     * @throws IOException if the file cannot be opened for writing
     */
    public java.io.PrintStream makeOutputFile() throws IOException {
        String theFile = theURL.getFile();

        // getFile() normally returns the name prefixed with a slash
        // (/index.html), but returns "" for a URL such as http://host, so the
        // slash is removed only when it is actually there.
        if (theFile.startsWith("/")) {
            theFile = theFile.substring(1);
        }

        System.err.println("\n\n\n" + theFile + "\n\n\n");

        if (theFile.equals("") || theFile.endsWith("/")) {
            theFile = theFile + "index.html";
        }

        FileOutputStream fout = new FileOutputStream(theFile);
        return new PrintStream(fout);
    }

    /**
     * Reads the remainder of a tag whose opening {@code <} has already been
     * consumed from the stream. Each byte is taken as one character.
     *
     * <p>The tag is considered to end at the first {@code >}; no attempt is
     * made to skip a {@code >} inside a quoted attribute value or a comment.
     *
     * @param is the stream positioned just after the {@code <}
     * @return the tag text including the leading {@code <} and, unless the
     *         stream ended first, the closing {@code >}
     */
    public static java.lang.String readTag(java.io.DataInputStream is) {
        StringBuffer theTag = new StringBuffer("<");
        try {
            char theChar;
            do {
                // readUnsignedByte keeps bytes >= 0x80 in the range 0..255;
                // casting a signed byte to char would sign-extend them.
                theChar = (char) is.readUnsignedByte();
                theTag.append(theChar);
            } while (theChar != '>');
        } catch (EOFException e) {
            // Done with the stream; return what was read so far.
        } catch (IOException e) {
            System.err.println(e);
        }
        return theTag.toString();
    }

    /**
     * Downloads the page and writes it to the file chosen by
     * {@link #makeOutputFile()}, passing every tag through
     * {@link #convertTag(String)} on the way. Failures are reported on
     * standard error rather than thrown.
     */
    public void saveThePage() {
        DataInputStream theHTML = null;
        PrintStream p = null;
        try {
            // Buffered because the page is consumed one byte at a time.
            theHTML = new DataInputStream(new BufferedInputStream(theURL.openStream()));
            p = makeOutputFile();
            while (true) {
                int thisByte = theHTML.readUnsignedByte();
                if (thisByte == '<') {
                    String theTag = convertTag(readTag(theHTML));
                    // The tag was built one char per byte, so ISO-8859-1 turns
                    // it back into the same bytes. Characters outside that
                    // range (possible only via the base URL) become '?'.
                    byte[] tagBytes = theTag.getBytes("ISO-8859-1");
                    p.write(tagBytes, 0, tagBytes.length);
                } else {
                    // Copy the raw byte so non-ASCII content is not re-encoded.
                    p.write(thisByte);
                }
            }
        } catch (EOFException e) {
            // This page is done
        } catch (Exception e) {
            System.err.println(e);
        } finally {
            // Either stream may still be null if opening it failed.
            if (p != null) {
                p.close();
            }
            if (theHTML != null) {
                try {
                    theHTML.close();
                } catch (IOException e) {
                    System.err.println(e);
                }
            }
        }
    }

    /** Tells whether {@code text} begins with {@code prefix}, ignoring letter case. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Returns the index of the first case-insensitive occurrence of {@code word}, or -1. */
    private static int indexOfIgnoreCase(String text, String word) {
        int last = text.length() - word.length();
        for (int i = 0; i <= last; i++) {
            if (text.regionMatches(true, i, word, 0, word.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Tells whether {@code c} ends a URL attribute value. Whitespace always
     * does; a quoted value also ends at its own quote character ({@code quote},
     * 0 for none); an unquoted value also ends at {@code "} or {@code >}.
     */
    private static boolean endsUrl(char c, char quote) {
        if (Character.isWhitespace(c)) {
            return true;
        }
        if (quote != 0) {
            return c == quote;
        }
        return c == '"' || c == '>';
    }
}