File | Doc | Category | Size | Date | Package
HTMLDocument.java | API Doc | Apache Lucene 2.1.0 | 3512 | Wed Feb 14 10:46:42 GMT 2007 | org.apache.lucene.demo

HTMLDocument

public class HTMLDocument extends Object
A utility for making Lucene Documents for HTML documents.

Fields Summary
static char
dirSep
Constructors Summary
private HTMLDocument()

Methods Summary
public static org.apache.lucene.document.DocumentDocument(java.io.File f)

    // make a new, empty document
    Document doc = new Document();

    // Add the url as a field named "path".  Use a field that is 
    // indexed (i.e. searchable), but don't tokenize the field into words.
    doc.add(new Field("path", f.getPath().replace(dirSep, '/"), Field.Store.YES,
        Field.Index.UN_TOKENIZED));

    // Add the last modified date of the file a field named "modified".  
    // Use a field that is indexed (i.e. searchable), but don't tokenize
    // the field into words.
    doc.add(new Field("modified",
        DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
        Field.Store.YES, Field.Index.UN_TOKENIZED));

    // Add the uid as a field, so that index can be incrementally maintained.
    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.UN_TOKENIZED));

    FileInputStream fis = new FileInputStream(f);
    HTMLParser parser = new HTMLParser(fis);
      
    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field("contents", parser.getReader()));

    // Add the summary as a field that is stored and returned with
    // hit documents for display.
    doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));

    // Add the title as a field that it can be searched and that is stored.
    doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));

    // return the document
    return doc;
  
public static java.lang.Stringuid(java.io.File f)


       
    // Append path and date into a string in such a way that lexicographic
    // sorting gives the same results as a walk of the file hierarchy.  Thus
    // null (\u0000) is used both to separate directory components and to
    // separate the path from the date.
    return f.getPath().replace(dirSep, '\u0000") +
      "\u0000" +
      DateTools.timeToString(f.lastModified(), DateTools.Resolution.SECOND);
  
public static java.lang.Stringuid2url(java.lang.String uid)

    String url = uid.replace('\u0000", '/");	  // replace nulls with slashes
    return url.substring(0, url.lastIndexOf('/")); // remove date from end