Methods Summary |
---|
public static org.apache.lucene.document.Document | Document(java.io.File file) Creates a Lucene Document from a {@link
java.io.File}.
// Parse the HTML file; the title and body text below come from this parsed form.
HtmlDocument htmlDoc = new HtmlDocument(file);
org.apache.lucene.document.Document luceneDoc =
    new org.apache.lucene.document.Document();
// "title" and "contents" are both stored and tokenized.
luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));
luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.TOKENIZED));
// Read the file a second time, line by line, to capture its unparsed text.
// readLine() strips line terminators and none are written back, so the
// lines end up concatenated without separators in "rawcontents".
StringWriter sw = new StringWriter();
BufferedReader br =
    new BufferedReader(new FileReader(file));
try {
    String line = br.readLine();
    while (line != null) {
        sw.write(line);
        line = br.readLine();
    }
} finally {
    // Release the file handle even when reading fails part-way through.
    br.close();
}
// "rawcontents" is stored only; it is not indexed.
luceneDoc.add(new Field("rawcontents", sw.toString(), Field.Store.YES, Field.Index.NO));
return luceneDoc;
|
public java.lang.String | getBody() Gets the bodyText attribute of the
HtmlDocument object.
// Without a parsed document there is no body to report.
if (rawDoc == null) {
    return null;
}
// Only the first <body> element is used; an absent body yields "".
NodeList bodyElements = rawDoc.getElementsByTagName("body");
return bodyElements.getLength() > 0 ? getBodyText(bodyElements.item(0)) : "";
|
private java.lang.String | getBodyText(org.w3c.dom.Node node) Gets the bodyText attribute of the
HtmlDocument object by recursively collecting the text found beneath the given node.
// Accumulates the text found beneath this node, in child order.
StringBuffer text = new StringBuffer();
NodeList children = node.getChildNodes();
for (int idx = 0; idx < children.getLength(); idx++) {
    Node current = children.item(idx);
    short kind = current.getNodeType();
    if (kind == Node.ELEMENT_NODE) {
        // Descend into the element and follow its text with a single space.
        text.append(getBodyText(current)).append(" ");
    } else if (kind == Node.TEXT_NODE) {
        text.append(((Text) current).getData());
    }
    // Every other node type contributes nothing.
}
return text.toString();
|
public static org.apache.lucene.document.Document | getDocument(java.io.InputStream is) Creates a Lucene Document from an {@link
java.io.InputStream}.
// Parse the stream, then copy the title and body text into a new Lucene document.
HtmlDocument parsed = new HtmlDocument(is);
org.apache.lucene.document.Document result = new org.apache.lucene.document.Document();

// Both fields are stored and tokenized.
Field titleField =
    new Field("title", parsed.getTitle(), Field.Store.YES, Field.Index.TOKENIZED);
result.add(titleField);

Field contentsField =
    new Field("contents", parsed.getBody(), Field.Store.YES, Field.Index.TOKENIZED);
result.add(contentsField);

return result;
|
public java.lang.String | getTitle() Gets the title attribute of the HtmlDocument
object.
// Without a parsed document there is no title to report.
if (rawDoc == null) {
    return null;
}
String title = "";
NodeList nl = rawDoc.getElementsByTagName("title");
if (nl.getLength() > 0) {
    // Only the first <title> element is consulted.
    Node firstChild = nl.item(0).getFirstChild();
    // The first child may be missing or may not be a Text node. Checking the
    // type keeps such a title as "" instead of failing with a
    // ClassCastException on an unchecked cast.
    if (firstChild instanceof Text) {
        title = ((Text) firstChild).getData();
    }
}
return title;
|
public static void | main(java.lang.String[] args) Runs HtmlDocument on the file named by the first
command-line argument and prints its title and body.
// Only the first command-line argument is used; without it there is nothing to parse.
if (args.length == 0) {
    System.err.println("Usage: HtmlDocument <html-file>");
    return;
}
FileInputStream in = new FileInputStream(new File(args[0]));
try {
    HtmlDocument doc = new HtmlDocument(in);
    System.out.println("Title = " + doc.getTitle());
    System.out.println("Body = " + doc.getBody());
} finally {
    // Close the stream whether or not parsing and printing succeeded.
    in.close();
}
|