File Doc Category Size Date Package
TrecDocMaker.java API Doc Apache Lucene 2.1.0 6323 Wed Feb 14 10:46:16 GMT 2007 org.apache.lucene.benchmark.byTask.feeds

TrecDocMaker

java.lang.Object
- BasicDocMaker

public class TrecDocMaker extends BasicDocMaker

A DocMaker that reads its input documents from the gzip-compressed files of the TREC collection.

Fields Summary
private static final String
newline
private DateFormat
dateFormat
private File
dataDir
private ArrayList
inputFiles
private int
nextFile
private int
iteration
private BufferedReader
reader
private GZIPInputStream
zis
Constructors Summary
Methods Summary
private void closeInputs()
  /*
   * Releases the currently open input, if any: first the gzip stream, then
   * the reader. An IOException raised while closing is reported on stdout
   * and otherwise ignored; the corresponding field is set to null either way.
   */
  if (null != zis) {
    try {
      zis.close();
    } catch (IOException closeFailure) {
      System.out.println("closeInputs(): Ingnoring error: " + closeFailure);
      closeFailure.printStackTrace();
    }
    zis = null;
  }

  if (null != reader) {
    try {
      reader.close();
    } catch (IOException closeFailure) {
      System.out.println("closeInputs(): Ingnoring error: " + closeFailure);
      closeFailure.printStackTrace();
    }
    reader = null;
  }
protected DocData getNextDocData()
if (reader==null) { openNextFile(); } // 1. skip until doc start read("<DOC>",null,false,false); // 2. name StringBuffer sb = read("<DOCNO>",null,true,false); String name = sb.substring("<DOCNO>".length()); name = name.substring(0,name.indexOf("</DOCNO>"))+"_"+iteration; // 3. skip until doc header read("<DOCHDR>",null,false,false); // 4. date sb = read("Date: ",null,true,false); String dateStr = sb.substring("Date: ".length()); // 5. skip until end of doc header read("</DOCHDR>",null,false,false); // 6. collect until end of doc sb = read("</DOC>",null,false,true); // this is the next document, so parse it HTMLParser p = new HTMLParser(new StringReader(sb.toString())); // title String title = p.getTitle(); // properties Properties props = p.getMetaTags(); // body Reader r = p.getReader(); char c[] = new char[1024]; StringBuffer bodyBuf = new StringBuffer(); int n; while ((n = r.read(c)) >= 0) { if (n>0) { bodyBuf.append(c,0,n); } } addBytes(bodyBuf.length()); DocData dd = new DocData(); dd.date = dateFormat.parse(dateStr.trim()); dd.name = name; dd.title = title; dd.body = bodyBuf.toString(); dd.props = props; return dd;
public int numUniqueTexts()
return inputFiles.size();
private void openNextFile()
  /*
   * Closes the current input (if any) and opens the next file from
   * inputFiles as a gzip stream, leaving zis and reader pointing at it.
   *
   * Files are taken in list order. When the end of the list is reached,
   * nextFile wraps to 0 and iteration is incremented. A file that cannot
   * be opened as gzip is reported on stdout and skipped; up to 20 files
   * are tried.
   *
   * Throws IllegalStateException if none of the 20 attempts succeeds
   * (previously this surfaced as a bare NullPointerException from
   * new InputStreamReader(null)).
   */
  closeInputs();
  int retries = 0;
  while (retries < 20) {
    File f = null;
    // Pick the next file and advance the cursor under the same lock that
    // resetInputs() holds when it rewinds nextFile/iteration.
    synchronized (this) {
      f = (File) inputFiles.get(nextFile++);
      if (nextFile >= inputFiles.size()) {
        // exhausted files, start a new round
        nextFile = 0;
        iteration++;
      }
    }
    System.out.println("opening: "+f+" length: "+f.length());
    FileInputStream fis = null;
    try {
      fis = new FileInputStream(f);
      zis = new GZIPInputStream(new BufferedInputStream(fis));
      break;
    } catch (Exception e) {
      // The GZIPInputStream constructor reads the header and can fail after
      // the file itself was opened; release that descriptor before retrying.
      if (fis != null) {
        try {
          fis.close();
        } catch (IOException ignored) {
          // Best effort: the file is being skipped anyway.
        }
      }
      retries++;
      System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
    }
  }
  // closeInputs() left zis null, so it is non-null only if an open succeeded.
  if (zis == null) {
    throw new IllegalStateException(
        "Could not open any input file as gzip after "+retries+" attempts");
  }
  reader = new BufferedReader(new InputStreamReader(zis));
private java.lang.StringBuffer read(java.lang.String prefix, java.lang.StringBuffer sb, boolean collectMatchLine, boolean collectAll)
  /*
   * Consumes lines from reader until one starts with prefix, and returns
   * the buffer of collected lines.
   *
   * prefix           - text the terminating line must start with
   * sb               - buffer to append to; a new one is created when null
   * collectMatchLine - whether the terminating line itself is appended
   * collectAll       - whether the lines before the terminating line are appended
   *
   * Appended lines are joined with newline (none before the first line
   * appended by this call). When the current input runs out, the next file
   * is opened and reading continues there.
   */
  if (sb == null) {
    sb = new StringBuffer();
  }
  String separator = "";
  boolean matched = false;
  while (!matched) {
    String line = reader.readLine();
    if (line == null) {
      // End of the current file: move on to the next one and keep looking.
      openNextFile();
      continue;
    }
    matched = line.startsWith(prefix);
    // The terminating line and the lines before it are governed by separate flags.
    boolean keepLine = matched ? collectMatchLine : collectAll;
    if (keepLine) {
      sb.append(separator).append(line);
      separator = newline;
    }
  }
  return sb;
public synchronized void resetInputs()
  /*
   * Resets this maker to its starting position: delegates to the superclass,
   * closes any open input, and rewinds both the file cursor and the
   * iteration counter to zero.
   */
  super.resetInputs();
  closeInputs();
  iteration = 0;
  nextFile = 0;
public void setConfig(org.apache.lucene.benchmark.byTask.utils.Config config)
  /*
   * Applies the configuration: after delegating to the superclass, resolves
   * the data directory as work/<docs.dir> (default "trec"), gathers its
   * files into inputFiles via collectFiles, and sets up the date parser.
   *
   * Throws RuntimeException when no input files were collected.
   */
  super.setConfig(config);

  String dirName = config.get("docs.dir","trec");
  File workDir = new File("work");
  dataDir = new File(workDir, dirName);

  collectFiles(dataDir, inputFiles);
  if (inputFiles.isEmpty()) {
    throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
  }

  // Header dates look like "Tue, 09 Dec 2003 22:39:08 GMT"; the pattern
  // covers the text up to the seconds, and parsing is lenient.
  SimpleDateFormat headerDateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US);
  headerDateFormat.setLenient(true);
  dateFormat = headerDateFormat;