File | Doc | Category | Size | Date | Package
ReutersDocMaker.java | API Doc | Apache Lucene 2.2.0 | 3526 | Sat Jun 16 22:20:58 BST 2007 | org.apache.lucene.benchmark.byTask.feeds

ReutersDocMaker

public class ReutersDocMaker extends BasicDocMaker
A DocMaker using the Reuters collection for its input. Config properties: docs.dir=<path to the docs dir, resolved under the "work" directory | Default: reuters-out>

Fields Summary
private DateFormat
dateFormat
private File
dataDir
private ArrayList
inputFiles
private int
nextFile
private int
iteration
Constructors Summary
Methods Summary
protected DocDatagetNextDocData()

    // Produces the next document: picks the next input file (wrapping around
    // to the first one when 'forever' is set), reads it, and packages its
    // date, title and body into a DocData. Throws NoMoreDataException once
    // every file has been consumed and 'forever' is false.
    File f = null;
    String name = null;
    // Only the file selection touches the shared counters, so only that part
    // is guarded; the file itself is read outside the lock.
    synchronized (this) {
      if (nextFile >= inputFiles.size()) { 
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        nextFile = 0;
        iteration++;
      }
      f = (File) inputFiles.get(nextFile++);
      // the iteration suffix makes the name differ between rounds over the same file
      name = f.getCanonicalPath()+"_"+iteration;
    }
    
    String dateStr;
    String title;
    StringBuffer bodyBuf = new StringBuffer(1024);
    BufferedReader reader = new BufferedReader(new FileReader(f));
    try {
      //First line is the date, 3rd is the title, rest is body
      dateStr = reader.readLine();
      reader.readLine();//skip an empty line
      title = reader.readLine();
      reader.readLine();//skip an empty line
      String line = null;
      while ((line = reader.readLine()) != null) {
        // body lines are joined with a single space in place of the line break
        bodyBuf.append(line).append(' ');
      }
    } finally {
      // release the file handle even when a read fails part-way through
      reader.close();
    }
    
    // report the size of the file just read
    addBytes(f.length());

    Date date = dateFormat.parse(dateStr.trim()); 
    return new DocData(name, bodyBuf.toString(), title, null, date);
  
public intnumUniqueTexts()

    return inputFiles.size();
  
public synchronized voidresetInputs()

    // Let the superclass reset first, then rewind to the first file of the
    // first round.
    super.resetInputs();
    iteration = 0;
    nextFile = 0;
  
public voidsetConfig(org.apache.lucene.benchmark.byTask.utils.Config config)

    // Applies the configuration: after the superclass has processed it, the
    // docs directory (property "docs.dir", default "reuters-out") is resolved
    // beneath "work" and passed to collectFiles together with inputFiles.
    // Fails with a RuntimeException if inputFiles is still empty afterwards.
    super.setConfig(config);

    String docsDirName = config.get("docs.dir","reuters-out");
    File workDir = new File("work");
    dataDir = new File(workDir, docsDirName);

    collectFiles(dataDir,inputFiles);
    if (inputFiles.isEmpty()) {
      String where = dataDir.getAbsolutePath();
      throw new RuntimeException("No txt files in dataDir: "+where);
    }

    // date format: 30-MAR-1987 14:22:36.87
    SimpleDateFormat reutersFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
    reutersFormat.setLenient(true);
    dateFormat = reutersFormat;