DemoHTMLParser | public class DemoHTMLParser extends Object implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser | HTML Parser that is based on Lucene's demo HTML parser. |
Fields Summary |
---|
DateFormat | dateFormat |
Constructors Summary |
---|
public DemoHTMLParser()
// Build and configure the formatter first, then publish it to the field.
// The pattern describes strings such as "Tue, 09 Dec 2003 22:39:08 GMT":
// 'kk' is the 1-24 hour-of-day field, and the pattern ends with a space and
// carries no time-zone letter, so the zone text of the example is not part
// of what the pattern itself describes.
SimpleDateFormat format = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US);
// Lenient mode: tolerate inputs that do not match the pattern exactly.
format.setLenient(true);
dateFormat = format;
|
Methods Summary |
---|
public DocData | parse(java.lang.String name, java.util.Date date, java.io.Reader reader, java.text.DateFormat dateFormat)
// Runs the demo HTML parser over 'reader' and packs the results into a DocData:
//   name       - handed to the DocData unchanged
//   date       - used as-is when non-null; otherwise derived from the "date"
//                meta tag when that tag is present
//   dateFormat - the formatter used to parse the "date" meta tag (this
//                parameter shadows the instance field of the same name)
org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
// title
String title = p.getTitle();
// properties
Properties props = p.getMetaTags();
// body: drain the parser's reader into a buffer, 1024 chars at a time
Reader r = p.getReader();
StringBuffer bodyBuf = new StringBuffer();
try {
  char c[] = new char[1024];
  int n;
  while ((n = r.read(c)) >= 0) {
    if (n>0) {
      bodyBuf.append(c,0,n);
    }
  }
} finally {
  // Close the reader even when read() throws, so it is not leaked on I/O errors.
  r.close();
}
// Look the meta tag up once; it is needed for the check, the parse and the message.
String dateProp = props.getProperty("date");
if (date == null && dateProp != null) {
  try {
    date = dateFormat.parse(dateProp.trim());
  } catch (ParseException e) {
    // do not fail test just because a date could not be parsed
    System.out.println("ignoring date parse exception (assigning 'now') for: "+dateProp);
    date = new Date(); // now
  }
}
return new DocData(name, bodyBuf.toString(), title, props, date);
| public DocData | parse(java.lang.String name, java.util.Date date, java.lang.StringBuffer inputText, java.text.DateFormat dateFormat)
// TODO Auto-generated method stub
return parse(name, date, new StringReader(inputText.toString()), dateFormat);
|
|