File | Doc | Category | Size | Date | Package
IndexHTML.java | API Doc | Apache Lucene 1.9 | 5529 | Mon Feb 27 11:12:38 GMT 2006 | org.apache.lucene.demo

IndexHTML

public class IndexHTML extends Object
Indexer for HTML files.

Fields Summary
private static boolean
deleting
private static IndexReader
reader
private static IndexWriter
writer
private static TermEnum
uidIter
Constructors Summary
private IndexHTML()

Methods Summary
private static void indexDocs(java.io.File file, java.lang.String index, boolean create)

    // Runs one pass over `file`. When `create` is false the existing index at
    // `index` is opened and a term iterator over the "uid" field is handed to
    // indexDocs(File) through the static `uidIter`; when `create` is true the
    // files are passed straight to indexDocs(File).
    if (!create) {                                // incrementally update

      reader = IndexReader.open(index);           // open existing index
      try {
        uidIter = reader.terms(new Term("uid", "")); // init uid iterator
        try {
          indexDocs(file);

          if (deleting) {                         // delete rest of stale docs
            // Any "uid" terms still ahead of the iterator were not matched by
            // a file during the walk above. The field name is compared with
            // equals() so the check does not depend on string interning.
            while (uidIter.term() != null && "uid".equals(uidIter.term().field())) {
              System.out.println("deleting " +
                  HTMLDocument.uid2url(uidIter.term().text()));
              reader.delete(uidIter.term());
              uidIter.next();
            }
            deleting = false;
          }
        } finally {
          uidIter.close();                        // close uid iterator, even on failure
        }
      } finally {
        reader.close();                           // close existing index, even on failure
      }

    } else {                                      // don't have an existing index
      indexDocs(file);
    }
  
private static void indexDocs(java.io.File file)

    // Recursively visits `file`. Directories are descended in sorted name
    // order; files ending in .html, .htm or .txt are reconciled against the
    // static `uidIter` when one is set, or added unconditionally otherwise.
    // Every other file is ignored.
    if (file.isDirectory()) {                     // if a directory
      String[] files = file.list();               // list its files
      if (files == null) {
        // File.list() returns null when the directory cannot be read; report
        // that explicitly instead of failing inside Arrays.sort.
        throw new java.io.IOException("cannot list directory: " + file.getPath());
      }
      // The uid comparison below only ever moves the iterator forward, so the
      // files must be visited in ascending order.
      // NOTE(review): presumably uid order follows path order; confirm
      // against HTMLDocument.uid.
      Arrays.sort(files);                         // sort the files
      for (int i = 0; i < files.length; i++) {    // recursively index them
        indexDocs(new File(file, files[i]));
      }

    } else if (file.getPath().endsWith(".html") || // index .html files
      file.getPath().endsWith(".htm") || // index .htm files
      file.getPath().endsWith(".txt")) { // index .txt files

      if (uidIter != null) {
        String uid = HTMLDocument.uid(file);      // construct uid for doc

        // Skip (and, in the deleting pass, remove) every indexed uid that
        // sorts before this file's uid. equals() is used for the field name so
        // the check does not depend on string interning.
        while (uidIter.term() != null && "uid".equals(uidIter.term().field()) &&
            uidIter.term().text().compareTo(uid) < 0) {
          if (deleting) {                         // delete stale docs
            System.out.println("deleting " +
                HTMLDocument.uid2url(uidIter.term().text()));
            reader.delete(uidIter.term());
          }
          uidIter.next();
        }
        if (uidIter.term() != null && "uid".equals(uidIter.term().field()) &&
            uidIter.term().text().compareTo(uid) == 0) {
          uidIter.next();                         // keep matching docs
        } else if (!deleting) {                   // add new docs
          Document doc = HTMLDocument.Document(file);
          System.out.println("adding " + doc.get("path"));
          writer.addDocument(doc);
        }
      } else {                                    // creating a new index
        Document doc = HTMLDocument.Document(file);
        System.out.println("adding " + doc.get("path"));
        writer.addDocument(doc);                  // add docs unconditionally
      }
    }
  
public static void main(java.lang.String[] argv)
Indexer for HTML files.

		  // document id iterator

      
       
    // Command-line entry point: parses [-create] [-index <index>] <root_directory>,
    // optionally runs a first pass that deletes stale documents, then runs a
    // pass that adds documents and optimizes the index. Any exception is
    // reported on standard output rather than propagated.
    try {
      String index = "index";
      boolean create = false;
      File root = null;

      String usage = "IndexHTML [-create] [-index <index>] <root_directory>";

      if (argv.length == 0) {
        System.err.println("Usage: " + usage);
        return;
      }

      for (int i = 0; i < argv.length; i++) {
        if (argv[i].equals("-index")) {           // parse -index option
          if (i + 1 >= argv.length) {             // -index given without a value
            System.err.println("Usage: " + usage);
            return;
          }
          index = argv[++i];
        } else if (argv[i].equals("-create")) {   // parse -create option
          create = true;
        } else if (i != argv.length-1) {          // only the last argument may be the root
          System.err.println("Usage: " + usage);
          return;
        } else {
          root = new File(argv[i]);
        }
      }

      if (root == null) {                         // options only, no root directory
        System.err.println("Usage: " + usage);
        return;
      }

      long start = System.currentTimeMillis();

      if (!create) {                              // delete stale docs
        deleting = true;
        indexDocs(root, index, create);
      }
      writer = new IndexWriter(index, new StandardAnalyzer(), create);
      try {
        writer.setMaxFieldLength(1000000);
        indexDocs(root, index, create);           // add new docs

        System.out.println("Optimizing index...");
        writer.optimize();
      } finally {
        writer.close();                           // close the writer even if indexing failed
      }

      long end = System.currentTimeMillis();

      System.out.print(end - start);
      System.out.println(" total milliseconds");

    } catch (Exception e) {
      System.out.println(" caught a " + e.getClass() +
          "\n with message: " + e.getMessage());
    }