File | Doc | Category | Size | Date | Package
PageSaver.java | API Doc | Example | 5876 | Thu Apr 03 15:15:26 BST 1997 | None

PageSaver

/**
 * Saves a copy of an HTML page to a local file, rewriting the relative
 * links found in anchor, image and applet tags into absolute URLs so they
 * still resolve once the page no longer lives at its original address.
 */
public class PageSaver extends Object {

    /** Address of the page to save; also the base for resolving relative links. */
    URL theURL;

    /**
     * Creates a saver for one page.
     *
     * @param u the URL of the page to download
     */
    public PageSaver(URL u) {
        theURL = u;
    }

    /**
     * Rewrites the link inside a single tag into an absolute URL.
     *
     * <p>Only tags starting with {@code <A HREF}, {@code <IMG } or
     * {@code <APPLET } (in any letter case) are examined; the attribute
     * rewritten is {@code HREF}, {@code SRC} or {@code CODEBASE}
     * respectively. A link is treated as relative when it contains no
     * colon. Everything outside the link itself is copied from the
     * original tag unchanged, including its letter case.
     *
     * @param tag the complete tag text, from {@code <} to {@code >}
     * @return the tag with its relative link made absolute, or the original
     *         tag when it has no link attribute, the link is empty or
     *         already absolute, or the link cannot be parsed
     */
    public java.lang.String convertTag(java.lang.String tag) {
        if (tag == null) {
            return tag;
        }

        String attribute;
        if (startsWithIgnoreCase(tag, "<A HREF")) {
            attribute = "HREF";
        } else if (startsWithIgnoreCase(tag, "<IMG ")) {
            attribute = "SRC";
        } else if (startsWithIgnoreCase(tag, "<APPLET ")) {
            attribute = "CODEBASE";
        } else {
            // this is not a link based tag
            return tag;
        }

        // All positions are computed on the original tag, never on an
        // upper-cased copy: case conversion is locale dependent and may
        // even change the length of the string.
        int length = tag.length();
        int attributeStart = indexOfIgnoreCase(tag, attribute);
        if (attributeStart == -1) {
            // Without this guard the search below would start at the
            // beginning of the tag and pick up an unrelated attribute.
            return tag;
        }
        int equalsSign = tag.indexOf('=', attributeStart);
        if (equalsSign == -1) {
            return tag;
        }

        // Ideally the = sign is immediately followed by a quoted URL, but a
        // lot of HTML is non-conforming, so optional whitespace and missing
        // quotes are tolerated.
        int linkStart = equalsSign + 1;
        while (linkStart < length && Character.isWhitespace(tag.charAt(linkStart))) {
            linkStart++;
        }
        if (linkStart >= length) {
            return tag;
        }

        char closer = '"';
        char first = tag.charAt(linkStart);
        if (first == '"' || first == '\'') {
            closer = first;
            linkStart++;
        }

        // The URL runs until the closing quote, whitespace, or the end of
        // the tag, whichever comes first.
        int linkEnd = linkStart;
        while (linkEnd < length) {
            char c = tag.charAt(linkEnd);
            if (c == closer || c == '>' || Character.isWhitespace(c)) {
                break;
            }
            linkEnd++;
        }

        String link = tag.substring(linkStart, linkEnd);

        // Relative URLs don't contain colons; anything else is left alone.
        if (link.length() == 0 || link.indexOf(':') != -1) {
            return tag;
        }

        try {
            // build an absolute URL from the relative URL
            URL newURL = new URL(theURL, link);
            return tag.substring(0, linkStart) + newURL + tag.substring(linkEnd);
        } catch (MalformedURLException e) {
            System.err.println(e);
            return tag;
        }
    }

    /**
     * Saves every page named on the command line.
     *
     * @param args the URLs of the pages to save
     */
    public static void main(java.lang.String[] args) {
        // Loop through the command line arguments
        for (int i = 0; i < args.length; i++) {
            try {
                URL root = new URL(args[i]);
                PageSaver ps = new PageSaver(root);
                ps.saveThePage();
            } catch (MalformedURLException e) {
                System.err.println(args[i] + " is not a parseable URL");
                System.err.println(e);
            }
        }
    }

    /**
     * Opens the local file the page is written to. The name is the file
     * part of the page's URL without its leading slash; {@code index.html}
     * is used when that name is empty or ends in a slash.
     *
     * @return a stream writing to the local file; the caller must close it
     * @throws java.io.IOException if the file cannot be opened for writing
     */
    public java.io.PrintStream makeOutputFile() throws java.io.IOException {
        String theFile = theURL.getFile();

        // getFile() normally returns the name prefixed with a slash, e.g.
        // /index.html instead of index.html, but it can also be empty.
        if (theFile.startsWith("/")) {
            theFile = theFile.substring(1);
        }
        System.err.println("\n\n\n" + theFile + "\n\n\n");
        if (theFile.equals("") || theFile.endsWith("/")) {
            theFile = theFile + "index.html";
        }

        // At this point you should check to see whether
        // the file already exists and, if it does,
        // ask the user if they wish to overwrite it

        FileOutputStream fout = new FileOutputStream(theFile);
        return new PrintStream(fout);
    }

    /**
     * Reads the remainder of a tag whose opening {@code <} has already been
     * consumed from the stream. Each byte becomes one character with the
     * same value (0-255), so the text can be written back byte for byte
     * using ISO-8859-1.
     *
     * @param is the stream positioned just after the {@code <}
     * @return the tag including the leading {@code <} and, unless the stream
     *         ended or failed first, the closing {@code >}
     */
    public static java.lang.String readTag(java.io.DataInputStream is) {
        StringBuffer theTag = new StringBuffer("<");
        try {
            int b;
            // read() yields an unsigned value and -1 at end of stream
            while ((b = is.read()) != -1) {
                theTag.append((char) b);
                if (b == '>') {
                    break;
                }
            }
        } catch (Exception e) {
            System.err.println(e);
        }
        return theTag.toString();
    }

    /**
     * Downloads the page and writes it to the file chosen by
     * {@link #makeOutputFile()}, passing every tag through
     * {@link #convertTag(String)}. Bytes outside tags are copied unchanged.
     * Failures are reported on {@code System.err}; both streams are closed
     * before returning.
     */
    public void saveThePage() {
        DataInputStream theHTML = null;
        PrintStream p = null;

        try {
            theHTML = new DataInputStream(
                    new java.io.BufferedInputStream(theURL.openStream()));
            p = makeOutputFile();

            int b;
            while ((b = theHTML.read()) != -1) {
                if (b == '<') {
                    String theTag = convertTag(readTag(theHTML));
                    // readTag maps bytes to chars one-to-one, so ISO-8859-1
                    // turns the tag back into the bytes that were read.
                    byte[] raw = theTag.getBytes("ISO-8859-1");
                    p.write(raw, 0, raw.length);
                } else {
                    p.write(b);
                }
            }
        } catch (Exception e) {
            System.err.println(e);
        } finally {
            // Either stream may still be null if opening it failed.
            if (p != null) {
                p.close();
            }
            if (theHTML != null) {
                try {
                    theHTML.close();
                } catch (java.io.IOException e) {
                    System.err.println(e);
                }
            }
        }
    }

    /** Tells whether text begins with prefix, ignoring letter case. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Returns the first index of word within text ignoring letter case, or -1. */
    private static int indexOfIgnoreCase(String text, String word) {
        int last = text.length() - word.length();
        for (int i = 0; i <= last; i++) {
            if (text.regionMatches(true, i, word, 0, word.length())) {
                return i;
            }
        }
        return -1;
    }
}