FileDocCategorySizeDatePackage
HtmlDocument.javaAPI DocApache Lucene 2.1.07395Wed Feb 14 10:45:44 GMT 2007org.apache.lucene.ant

HtmlDocument

public class HtmlDocument extends Object
The HtmlDocument class creates a Lucene {@link org.apache.lucene.document.Document} from an HTML document.

It does this by using the JTidy package. It can take input from {@link java.io.File} or {@link java.io.InputStream}.

author
Erik Hatcher

Fields Summary
private Element
rawDoc
Constructors Summary
public HtmlDocument(File file)
Constructs an HtmlDocument from a {@link java.io.File}.

param
file the File containing the HTML to parse
exception
IOException if an I/O exception occurs

        // Configure JTidy to parse silently: no progress chatter, no warnings.
        Tidy tidy = new Tidy();
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);

        // This constructor opens the stream itself, so it is also responsible
        // for closing it -- whether parsing succeeds or throws.
        FileInputStream in = new FileInputStream(file);
        try {
            org.w3c.dom.Document root = tidy.parseDOM(in, null);
            // Keep only the root element; title/body lookups start from it.
            rawDoc = root.getDocumentElement();
        } finally {
            in.close();
        }
    
public HtmlDocument(InputStream is)
Constructs an HtmlDocument from an {@link java.io.InputStream}.

param
is the InputStream containing the HTML

        // Silent JTidy parser: warnings and informational output are suppressed.
        Tidy parser = new Tidy();
        parser.setShowWarnings(false);
        parser.setQuiet(true);

        // The caller owns the stream; it is only read here.
        // The root element of the parsed DOM is what the accessors work on.
        rawDoc = parser.parseDOM(is, null).getDocumentElement();
    
Methods Summary
public static org.apache.lucene.document.DocumentDocument(java.io.File file)
Creates a Lucene Document from a {@link java.io.File}.

param
file
exception
IOException

        HtmlDocument htmlDoc = new HtmlDocument(file);
        org.apache.lucene.document.Document luceneDoc =
                new org.apache.lucene.document.Document();

        // Title and body text are both stored and tokenized for searching.
        luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));
        luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.TOKENIZED));

        // Read the file a second time as plain text for the "rawcontents" field.
        // Lines are concatenated without separators because readLine() strips
        // the line terminators.
        StringWriter sw = new StringWriter();
        BufferedReader br =
                new BufferedReader(new FileReader(file));
        try {
            String line = br.readLine();
            while (line != null) {
                sw.write(line);
                line = br.readLine();
            }
        } finally {
            // Release the file handle even when reading fails part-way.
            br.close();
        }
        String contents = sw.toString();

        // The raw markup is stored for retrieval only; it is not indexed.
        luceneDoc.add(new Field("rawcontents", contents, Field.Store.YES, Field.Index.NO));

        return luceneDoc;
    
public java.lang.StringgetBody()
Gets the bodyText attribute of the HtmlDocument object.

return
the bodyText value

        // No parsed document: report "no value" rather than an empty string.
        if (rawDoc == null) {
            return null;
        }

        NodeList bodies = rawDoc.getElementsByTagName("body");
        if (bodies.getLength() == 0) {
            // Document parsed, but it contains no <body> element.
            return "";
        }

        // Only the first <body> element contributes text.
        return getBodyText(bodies.item(0));
    
private java.lang.StringgetBodyText(org.w3c.dom.Node node)
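Recursively collects the text found beneath the given node: text nodes are appended as-is, and the text of each child element is appended followed by a single space.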

param
node a DOM Node
return
The bodyText value

        StringBuffer text = new StringBuffer();
        NodeList children = node.getChildNodes();

        for (int idx = 0; idx < children.getLength(); idx++) {
            Node current = children.item(idx);
            short type = current.getNodeType();

            if (type == Node.ELEMENT_NODE) {
                // Descend into the element and follow its text with a space.
                text.append(getBodyText(current));
                text.append(" ");
            } else if (type == Node.TEXT_NODE) {
                text.append(((Text) current).getData());
            }
            // Every other node type is skipped.
        }

        return text.toString();
    
public static org.apache.lucene.document.DocumentgetDocument(java.io.InputStream is)
Creates a Lucene Document from an {@link java.io.InputStream}.

param
is

        HtmlDocument parsed = new HtmlDocument(is);
        org.apache.lucene.document.Document result =
                new org.apache.lucene.document.Document();

        // Title first, then body; both are stored and tokenized.
        String title = parsed.getTitle();
        result.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));

        String body = parsed.getBody();
        result.add(new Field("contents", body, Field.Store.YES, Field.Index.TOKENIZED));

        // Unlike the File-based factory, no "rawcontents" field is added here.
        return result;
    
public java.lang.StringgetTitle()
Gets the title attribute of the HtmlDocument object.

return
the title value

        // No parsed document: report "no value" rather than an empty string.
        if (rawDoc == null) {
            return null;
        }

        String title = "";

        NodeList nl = rawDoc.getElementsByTagName("title");
        if (nl.getLength() > 0) {
            // Only the first <title> element is consulted.
            Node firstChild = nl.item(0).getFirstChild();
            // The first child is not guaranteed to be a text node (it may be
            // absent, or e.g. a comment); in those cases the title stays empty
            // instead of failing with a ClassCastException.
            if (firstChild instanceof Text) {
                title = ((Text) firstChild).getData();
            }
        }
        return title;
    
public static voidmain(java.lang.String[] args)
Runs HtmlDocument on the first file specified on the command line and prints its title and body text to standard output.

param
args Command line arguments
exception
Exception if the file named by the first argument cannot be opened

//         HtmlDocument doc = new HtmlDocument(new File(args[0]));
//         System.out.println("Title = " + doc.getTitle());
//         System.out.println("Body  = " + doc.getBody());

        // Exercise the InputStream-based constructor on the first argument.
        File inputFile = new File(args[0]);
        FileInputStream input = new FileInputStream(inputFile);
        HtmlDocument parsed = new HtmlDocument(input);

        String titleLine = "Title = " + parsed.getTitle();
        System.out.println(titleLine);
        String bodyLine = "Body  = " + parsed.getBody();
        System.out.println(bodyLine);