// File: PageSaver.java | Doc: API Doc | Category: Example | Size: 5876
// Date: Thu Apr 03 15:15:26 BST 1997 | Package: None

// PageSaver.java

import java.net.*;
import java.io.*;

// PageSaver downloads an HTML page and stores it in the current directory,
// rewriting the relative links in <A>, <IMG> and <APPLET> tags as absolute
// URLs so that the local copy still points at the original resources.
public class PageSaver {

  // The page to download; it is also the base URL against which the
  // relative links found in the page are resolved.
  URL theURL;

  // Saves every URL given on the command line.
  public static void main (String args[]) {

    // Loop through the command line arguments
    for  (int i = 0; i < args.length; i++) {

      //Open the URL for reading
      try {
        // Use the i-th argument; indexing with 0 here would save the
        // first page over and over and ignore all the others.
        URL root = new URL(args[i]);
        PageSaver ps = new PageSaver(root);
        ps.saveThePage();
      }
      catch (MalformedURLException e) {
        System.err.println(args[i] + " is not a parseable URL");
        System.err.println(e);
      }
    } //  end for

  } // end main


  // Creates a saver for the page at u.
  public PageSaver(URL u) {

    theURL = u;

  }

  // saveThePage opens a DataInputStream from the URL,
  // opens a PrintStream onto a file for the output,
  // and then copies one to the other while rewriting tags.
  // The page is handled as raw bytes (one byte == one ISO-8859-1
  // character) so that text outside the rewritten URLs is copied
  // through unchanged whatever the page's real encoding is.
  // Errors are reported on System.err; nothing is thrown.
  public void saveThePage() {

   DataInputStream theHTML = null;
   PrintStream  p = null;

   try {
     theHTML = new DataInputStream(
      new BufferedInputStream(theURL.openStream()));
     p = makeOutputFile();

     while (true) {
       // readUnsignedByte keeps bytes >= 0x80 in the range 128..255;
       // a plain (char) cast of readByte would sign-extend them.
       int thisByte = theHTML.readUnsignedByte();
       if (thisByte == '<') {
         String theTag = convertTag(readTag(theHTML));
         byte[] tagBytes = theTag.getBytes("ISO-8859-1");
         p.write(tagBytes, 0, tagBytes.length);
       }
       else {
         p.write(thisByte);
       }
     } // end while
   }  // end try
   catch (EOFException e) {  // This page is done
   }
   catch (Exception e) {
      System.err.println(e);
   }
   finally {
     // Either stream may still be null if opening it failed.
     if (p != null) {
       p.close();
     }
     if (theHTML != null) {
       try {
         theHTML.close();
       }
       catch (IOException e) {
         System.err.println(e);
       }
     }
   }

  }  // end SaveThePage


  // We need to open a file on the local file system
  // with the same name as the remote file;
  // then chain a PrintStream to the file.
  // Only the last segment of the URL's path is used as the name, so
  // /dir/page.html is saved as page.html and a query string is ignored.
  // A URL that names no file (e.g. http://host or http://host/dir/)
  // is saved as index.html.
  // Throws an IOException if the file cannot be opened for writing.
  public PrintStream makeOutputFile() throws IOException {

    String theFile = theURL.getPath();

    // Drop everything up to and including the last slash. When there is
    // no slash at all lastIndexOf returns -1 and the whole (possibly
    // empty) string is kept, so this never indexes past the end.
    theFile = theFile.substring(theFile.lastIndexOf('/') + 1);
    System.err.println("\n\n\n" + theFile + "\n\n\n");
    if (theFile.equals("")) theFile = "index.html";

    // At this point you should check to see whether
    // the file already exists and, if it does,
    // ask the user if they wish to overwrite it

    return new PrintStream(
     new BufferedOutputStream(new FileOutputStream(theFile)));

  }

  // The readTag method is called when a < is encountered
  // in the input stream.  This method is responsible
  // for reading the remainder of the tag.
  // Note that when this method has been called the <
  // has been read from the input stream but has not yet been sent
  // to the output stream; the returned String therefore starts with <
  // and normally ends with >. If the stream ends first, whatever was
  // read so far is returned.
  // Each byte becomes one character in the range 0..255.
  // This method has trouble (as do most web browsers)
  // if it encounters a raw < sign in the Stream. Technically
  // raw < signs should be encoded as &lt; in the original HTML.
  public static String readTag(DataInputStream is) {

    StringBuffer theTag = new StringBuffer("<");
    char theChar = '<';

    try {
       while (theChar != '>') {
         theChar = (char) is.readUnsignedByte();
         theTag.append(theChar);
       } // end while
     }  // end try
     catch (EOFException e) {
       // Done with the Stream
     }
     catch (Exception e) {
        System.err.println(e);
     }

     return theTag.toString();

  }

  // The convertTag method takes a complete tag as
  // a String and, if it's a relative link, converts it
  // to an absolute link.  The converted tag is returned.
  // Only the HREF of <A>, the SRC of <IMG> and the CODEBASE of <APPLET>
  // are looked at. Everything outside the URL itself is returned exactly
  // as it was passed in; any tag that cannot be understood is returned
  // unchanged.
  public String convertTag(String tag) {

    if (tag == null) return tag;

    // HTML tag and attribute names are case insensitive, so the search
    // is done on an upper-cased copy. Only a-z are changed, which keeps
    // every index valid for the original tag as well.
    String upper = asciiUpperCase(tag);

    String attribute;
    if (isElement(upper, "<A")) {
      attribute = "HREF";
    }
    else if (isElement(upper, "<IMG")) {
      attribute = "SRC";
    }
    else if (isElement(upper, "<APPLET")) {
      attribute = "CODEBASE";
    }
    else { // this is not a link based tag
      return tag;
    }

    // find the = that belongs to the attribute
    int equalsSign = findAttribute(upper, attribute);
    if (equalsSign == -1) return tag;

    // Ideally the = sign is immediately followed by
    // a " mark followed by the URL which is closed by a ".
    // However since a lot of HTML is non-conforming,
    // whitespace after the = and a missing or single quote
    // are tolerated too.
    int length = tag.length();
    int start = equalsSign + 1;
    while (start < length && isHtmlSpace(tag.charAt(start))) {
      start++;
    }
    if (start >= length) return tag;

    boolean singleQuoted = false;
    char opening = tag.charAt(start);
    if (opening == '"' || opening == '\'') {
      singleQuoted = (opening == '\'');
      start++;
    }

    // start now points to the beginning of the URL.
    // The URL is read until whitespace, a ", the closing > of the tag
    // or (for a single-quoted value) the closing ' is seen.
    int end = start;
    while (end < length) {
      char c = tag.charAt(end);
      if (isHtmlSpace(c) || c == '"' || c == '>'
       || (singleQuoted && c == '\'')) {
        break;
      }
      end++;
    }

    // URLs are in general NOT case insensitive so the URL
    // must be read from the original tag and not from the
    // upper-cased copy.
    String link = tag.substring(start, end);

    // Is it a relative URL?  Relative URLs
    // don't contain colons.
    if (link.length() == 0 || link.indexOf(':') != -1) {
      return tag;
    }

    try {
      // build an absolute URL from the relative URL
      URL newURL = new URL(theURL, link);
      // replace the old URL with the new URL, leaving the rest of
      // the original tag untouched
      return tag.substring(0, start) + newURL + tag.substring(end);
    }
    catch (MalformedURLException e) {
      System.err.println(e);
      return tag;
    }

  }

  // Returns s with the letters a-z replaced by A-Z and every other
  // character left alone; the result has the same length as s.
  private static String asciiUpperCase(String s) {

    char[] chars = s.toCharArray();
    for (int i = 0; i < chars.length; i++) {
      if (chars[i] >= 'a' && chars[i] <= 'z') {
        chars[i] = (char) (chars[i] - 'a' + 'A');
      }
    }
    return new String(chars);

  }

  // True for space, tab, line feed, form feed and carriage return.
  private static boolean isHtmlSpace(char c) {

    return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';

  }

  // True if upperTag starts with prefix (e.g. "<IMG") followed by
  // whitespace, i.e. it is that element and it has attributes.
  private static boolean isElement(String upperTag, String prefix) {

    return upperTag.startsWith(prefix)
     && upperTag.length() > prefix.length()
     && isHtmlSpace(upperTag.charAt(prefix.length()));

  }

  // Looks in upperTag for name preceded by whitespace and followed,
  // after optional whitespace, by an = sign. Returns the index of
  // that = sign or -1 if there is no such attribute.
  // The search does not understand quoting, so text of that form
  // inside another attribute's value would be matched as well.
  private static int findAttribute(String upperTag, String name) {

    int from = 0;
    while (true) {
      int at = upperTag.indexOf(name, from);
      if (at == -1) return -1;
      if (at > 0 && isHtmlSpace(upperTag.charAt(at - 1))) {
        int after = at + name.length();
        while (after < upperTag.length()
         && isHtmlSpace(upperTag.charAt(after))) {
          after++;
        }
        if (after < upperTag.length() && upperTag.charAt(after) == '=') {
          return after;
        }
      }
      from = at + 1;
    }

  }

}