File | Doc | Category | Size | Date | Package
TrecDocMaker.java | API Doc | Apache Lucene 2.1.0 | 6323 | Wed Feb 14 10:46:16 GMT 2007 | org.apache.lucene.benchmark.byTask.feeds

TrecDocMaker

public class TrecDocMaker extends BasicDocMaker
A DocMaker using the (compressed) Trec collection for its input.

Fields Summary
private static final String
newline
private DateFormat
dateFormat
private File
dataDir
private ArrayList
inputFiles
private int
nextFile
private int
iteration
private BufferedReader
reader
private GZIPInputStream
zis
Constructors Summary
Methods Summary
private voidcloseInputs()

    if (zis!=null) {
      try {
        zis.close();
      } catch (IOException e) {
        System.out.println("closeInputs(): Ingnoring error: "+e);
        e.printStackTrace();
      }
      zis = null;
    }
    if (reader!=null) { 
      try {
        reader.close();
      } catch (IOException e) {
        System.out.println("closeInputs(): Ingnoring error: "+e);
        e.printStackTrace();
      }
      reader = null;
    }
  
protected DocDatagetNextDocData()

    /*
     * Reads the next document from the current input and returns it as a
     * DocData holding its name, date, title, body and meta-tag properties.
     *
     * Each read(prefix, null, collectMatchLine, collectAll) call below consumes
     * input lines up to and including the first line that starts with the
     * given prefix; the two flags decide whether the matching line and/or the
     * lines before it are returned in the buffer.
     */
    if (reader==null) {
      // nothing is open yet (first call, or the inputs were closed)
      openNextFile();
    }
    // 1. skip until doc start
    read("<DOC>",null,false,false); 
    // 2. name: the text between <DOCNO> and </DOCNO>, suffixed with the
    //    current iteration number
    // NOTE(review): if "</DOCNO>" is not on the same line, indexOf returns -1
    // and substring(0,-1) throws StringIndexOutOfBoundsException.
    StringBuffer sb = read("<DOCNO>",null,true,false);
    String name = sb.substring("<DOCNO>".length());
    name = name.substring(0,name.indexOf("</DOCNO>"))+"_"+iteration;
    // 3. skip until doc header
    read("<DOCHDR>",null,false,false); 
    // 4. date: the remainder of the first line starting with "Date: "
    //    (parsed further below)
    sb = read("Date: ",null,true,false);
    String dateStr = sb.substring("Date: ".length());
    // 5. skip until end of doc header
    read("</DOCHDR>",null,false,false); 
    // 6. collect until end of doc: every line before the "</DOC>" line is
    //    kept, the "</DOC>" line itself is not
    sb = read("</DOC>",null,false,true);
    // the collected text is the document content, so parse it as HTML
    HTMLParser p = new HTMLParser(new StringReader(sb.toString()));
    // title
    String title = p.getTitle();
    // properties (the parser's meta tags)
    Properties props = p.getMetaTags(); 
    // body: drain the parser's reader in 1024-char chunks until it reports
    // end of stream (negative count)
    Reader r = p.getReader();
    char c[] = new char[1024];
    StringBuffer bodyBuf = new StringBuffer();
    int n;
    while ((n = r.read(c)) >= 0) {
      if (n>0) {
        bodyBuf.append(c,0,n);
      }
    }
    // the value passed is the body length in chars
    // (addBytes is not defined in this view; presumably an inherited counter)
    addBytes(bodyBuf.length());
    
    DocData dd = new DocData();
    
    // surrounding whitespace is trimmed before the date is parsed
    dd.date = dateFormat.parse(dateStr.trim());
    dd.name = name;
    dd.title = title;
    dd.body = bodyBuf.toString();
    dd.props = props;
    return dd;
  
public intnumUniqueTexts()

    return inputFiles.size();
  
private voidopenNextFile()

    /*
     * Closes whatever is currently open, then opens the next input file as a
     * gzip stream and wraps it in a buffered reader.
     *
     * A file that cannot be opened is reported and skipped. After 20 failed
     * attempts a RuntimeException is thrown with the last failure as its
     * cause, rather than continuing with a null stream.
     */
    closeInputs();
    final int maxRetries = 20;
    Exception lastFailure = null;
    int retries = 0;
    while (retries<maxRetries) {
      File f = null;
      synchronized (this) {
        // pick the file and advance the cursor under the same lock
        f = (File) inputFiles.get(nextFile++);
        if (nextFile >= inputFiles.size()) { 
          // exhausted files, start a new round
          nextFile = 0;
          iteration++;
        }
      }
      System.out.println("opening: "+f+" length: "+f.length());
      FileInputStream raw = null;
      try {
        raw = new FileInputStream(f);
        zis = new GZIPInputStream(new BufferedInputStream(raw));
        break;
      } catch (Exception e) {
        // The gzip constructor reads the header and may fail after the file
        // was opened; close the file so skipped files do not leak handles.
        if (raw != null) {
          try {
            raw.close();
          } catch (IOException ignored) {
            // the file is being skipped anyway; nothing more to do
          }
        }
        lastFailure = e;
        retries++;
        System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+"  #retries="+retries);
      }
    }
    if (zis == null) {
      // every attempt failed: fail with the real cause attached
      throw new RuntimeException("Could not open any input file after "+retries+" attempts", lastFailure);
    }
    // NOTE(review): this InputStreamReader constructor decodes with the
    // platform default charset.
    reader = new BufferedReader(new InputStreamReader(zis));
  
private java.lang.StringBufferread(java.lang.String prefix, java.lang.StringBuffer sb, boolean collectMatchLine, boolean collectAll)

    /*
     * Consumes lines from the reader up to and including the first line that
     * starts with prefix, moving on to the next input file whenever the
     * current one runs out.
     *
     * prefix           - start of the line that ends the scan
     * sb               - buffer to append to; a new one is created when null
     * collectMatchLine - whether the matching line itself is appended
     * collectAll       - whether the lines before the match are appended
     *
     * Appended lines are separated by newline. Returns the buffer.
     */
    StringBuffer collected = (sb != null) ? sb : new StringBuffer();
    String separator = "";
    for (;;) {
      String current = reader.readLine();
      if (current == null) {
        // end of this file: continue the scan in the next one
        openNextFile();
        continue;
      }
      boolean matched = current.startsWith(prefix);
      boolean keep;
      if (matched) {
        keep = collectMatchLine;
      } else {
        keep = collectAll;
      }
      if (keep) {
        collected.append(separator).append(current);
        separator = newline;
      }
      if (matched) {
        return collected;
      }
    }
  
public synchronized voidresetInputs()

    // Run the superclass reset first, then release the open input.
    super.resetInputs();
    closeInputs();
    // Start over from the first file of the first round.
    iteration = 0;
    nextFile = 0;
  
public voidsetConfig(org.apache.lucene.benchmark.byTask.utils.Config config)

  
  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
      
    super.setConfig(config);
    String d = config.get("docs.dir","trec");
    dataDir = new File(new File("work"),d);
    collectFiles(dataDir,inputFiles);
    if (inputFiles.size()==0) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
    }
    // date format: 30-MAR-1987 14:22:36.87
    dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US);  //Tue, 09 Dec 2003 22:39:08 GMT
    dateFormat.setLenient(true);