File | Doc | Category | Size | Date | Package
ReutersDocMaker.java | API Doc | Apache Lucene 2.1.0 | 3446 | Wed Feb 14 10:46:16 GMT 2007 | org.apache.lucene.benchmark.byTask.feeds

ReutersDocMaker

public class ReutersDocMaker extends BasicDocMaker
A DocMaker using the Reuters collection for its input.

Fields Summary
private DateFormat
dateFormat
private File
dataDir
private ArrayList
inputFiles
private int
nextFile
private int
iteration
Constructors Summary
Methods Summary
protected DocDatagetNextDocData()

    File f = null;
    String name = null;
    synchronized (this) {
      f = (File) inputFiles.get(nextFile++);
      name = f.getCanonicalPath()+"_"+iteration;
      if (nextFile >= inputFiles.size()) { 
        // exhausted files, start a new round
        nextFile = 0;
        iteration++;
      }
    }
    
    BufferedReader reader = new BufferedReader(new FileReader(f));
    String line = null;
    //First line is the date, 3rd is the title, rest is body
    String dateStr = reader.readLine();
    reader.readLine();//skip an empty line
    String title = reader.readLine();
    reader.readLine();//skip an empty line
    StringBuffer bodyBuf = new StringBuffer(1024);
    while ((line = reader.readLine()) != null) {
      bodyBuf.append(line).append(' ");
    }
    
    addBytes(f.length());

    DocData dd = new DocData();
    
    dd.date = dateFormat.parse(dateStr.trim());
    dd.name = name;
    dd.title = title;
    dd.body = bodyBuf.toString();
    return dd;
  
public intnumUniqueTexts()

    return inputFiles.size();
  
public synchronized voidresetInputs()

    super.resetInputs();
    nextFile = 0;
    iteration = 0;
  
public voidsetConfig(org.apache.lucene.benchmark.byTask.utils.Config config)

  
  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
      
    super.setConfig(config);
    String d = config.get("docs.dir","reuters-out");
    dataDir = new File(new File("work"),d);
    collectFiles(dataDir,inputFiles);
    if (inputFiles.size()==0) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
    }
    // date format: 30-MAR-1987 14:22:36.87
    dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
    dateFormat.setLenient(true);