File: BasicDocMaker.java — Category: API Doc — Apache Lucene 2.2.0 — 10325 bytes — Sat Jun 16 22:20:58 BST 2007 — Package: org.apache.lucene.benchmark.byTask.feeds

BasicDocMaker

public abstract class BasicDocMaker extends Object implements DocMaker
Create documents for the test. Maintains counters of chars etc. so that sub-classes just need to provide textual content, and the create-by-size is handled here.

Config Params (default is in caps): doc.stored=true|FALSE
doc.tokenized=TRUE|false
doc.term.vector=true|FALSE
doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field

Fields Summary
private int
numDocsCreated
private boolean
storeBytes
protected boolean
forever
private ThreadLocal
leftovr
static final String
BODY_FIELD
private long
numBytes
private long
numUniqueBytes
protected org.apache.lucene.benchmark.byTask.utils.Config
config
protected Field$Store
storeVal
protected Field$Index
indexVal
protected Field$TermVector
termVecVal
private int
lastPrintedNumUniqueTexts
private long
lastPrintedNumUniqueBytes
private int
printNum
private HTMLParser
htmlParser
Constructors Summary
Methods Summary
protected synchronized voidaddBytes(long n)

    numBytes += n;
  
protected voidaddUniqueBytes(long n)

    numUniqueBytes += n;
  
protected voidcollectFiles(java.io.File f, java.util.ArrayList inputFiles)

    //System.out.println("Collect: "+f.getAbsolutePath());
    if (!f.canRead()) {
      return;
    }
    if (f.isDirectory()) {
      File files[] = f.listFiles();
      for (int i = 0; i < files.length; i++) {
        collectFiles(files[i],inputFiles);
      }
      return;
    }
    inputFiles.add(f);
    addUniqueBytes(f.length());
  
/**
 * Build a Lucene Document from the given input data, consuming up to
 * {@code size} chars of its body. The docData body is mutated in place:
 * whatever text this document used is removed, and the remainder (if any)
 * is left for the caller's leftover mechanism.
 *
 * @param docData source text and metadata; its body is consumed in place
 * @param size    number of body chars to use; {@code <=0} means use the whole body
 * @param cnt     sub-document counter appended to the doc name, or negative for none
 * @return the populated Document
 */
private org.apache.lucene.document.DocumentcreateDocument(DocData docData, int size, int cnt)

    // Every created doc gets a unique, monotonically increasing id.
    int docid = incrNumDocsCreated();
    Document doc = new Document();
    doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
    if (docData.getName()!=null) {
      // When this is the cnt-th slice of a larger text, suffix the name with "_cnt".
      String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
      doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
    }
    if (docData.getDate()!=null) {
      String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
      doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
    }
    if (docData.getTitle()!=null) {
      doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
    }
    if (docData.getBody()!=null && docData.getBody().length()>0) {
      String bdy;
      if (size<=0 || size>=docData.getBody().length()) {
        bdy = docData.getBody(); // use all
        docData.setBody("");  // nothing left
      } else {
        // attempt not to break words - if whitespace found within next 20 chars...
        for (int n=size-1; n<size+20 && n<docData.getBody().length(); n++) {
          if (Character.isWhitespace(docData.getBody().charAt(n))) {
            size = n;
            break;
          }
        }
        bdy = docData.getBody().substring(0,size); // use part
        docData.setBody(docData.getBody().substring(size)); // some left
      }
      doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
      if (storeBytes == true) {
        // Store the raw UTF-8 bytes of the body when doc.store.body.bytes=true.
        // NOTE(review): getBytes("UTF-8") declares UnsupportedEncodingException;
        // presumably the (not shown) throws clause of this method covers it - confirm.
        doc.add(new Field("bytes", bdy.getBytes("UTF-8"), Field.Store.YES));
      }
    }

    if (docData.getProps()!=null) {
      // Copy any extra source-specific properties into same-named fields,
      // then release them so they are not re-added for a later slice.
      for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext(); ) {
        String key = (String) it.next();
        String val = (String) docData.getProps().get(key);
        doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
      }
      docData.setProps(null);
    }
    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
    return doc;
public synchronized longgetByteCount()

    return numBytes;
  
public synchronized intgetCount()

    return numDocsCreated;
  
public HTMLParsergetHtmlParser()

    return htmlParser;
  
protected abstract DocDatagetNextDocData()
Return the data of the next document. All current implementations can create docs forever. When the input data is exhausted, input files are iterated. This re-iteration can be avoided by setting doc.maker.forever to false (default is true).

return
data of the next document.
exception
Exception, if the next document's data cannot be created.
exception
NoMoreDataException, if the input data is exhausted (and 'forever' is set to false).

private synchronized intincrNumDocsCreated()

  
      
    return numDocsCreated++;
  
public org.apache.lucene.document.DocumentmakeDocument()

    resetLeftovers();
    DocData docData = getNextDocData();
    Document doc = createDocument(docData,0,-1);
    return doc;
  
/**
 * Create a document of (approximately) {@code size} chars, stitching input
 * texts together when one alone is too short, and stashing any unused
 * remainder in a per-thread leftover for the next call.
 *
 * @param size desired number of body chars for the new document
 * @return the newly created document
 */
public org.apache.lucene.document.DocumentmakeDocument(int size)

    LeftOver lvr = (LeftOver) leftovr.get();
    // A leftover with no remaining body text is stale - clear the thread-local
    // (note: the local lvr reference is still used below even after the reset).
    if (lvr==null || lvr.docdata==null || lvr.docdata.getBody()==null || lvr.docdata.getBody().length()==0) {
      resetLeftovers();
    }
    // Continue from the leftover text if there is one, else fetch fresh data.
    DocData dd = (lvr==null ? getNextDocData() : lvr.docdata);
    int cnt = (lvr==null ? 0 : lvr.cnt);
    // Accumulate input texts until we have at least 'size' chars of body.
    while (dd.getBody()==null || dd.getBody().length()<size) {
      // NOTE(review): if dd2.getBody() is null here, string concatenation
      // inserts the literal text "null" into the body - confirm the data
      // sources always supply a non-null body before the first iteration.
      DocData dd2 = dd;
      dd = getNextDocData();
      cnt = 0;
      dd.setBody(dd2.getBody() + dd.getBody());
    }
    // createDocument() consumes up to 'size' chars from dd's body in place.
    Document doc = createDocument(dd,size,cnt);
    if (dd.getBody()==null || dd.getBody().length()==0) {
      // Nothing left over - drop any stored leftover for this thread.
      resetLeftovers();
    } else {
      // Remember the unconsumed remainder (and slice counter) for next time.
      if (lvr == null) {
        lvr = new LeftOver();
        leftovr.set(lvr);
      }
      lvr.docdata = dd;
      lvr.cnt = ++cnt;
    }
    return doc;
public longnumUniqueBytes()

    return numUniqueBytes;
  
public voidprintDocStatistics()

  
     
    boolean print = false;
    String col = "                  ";
    StringBuffer sb = new StringBuffer();
    String newline = System.getProperty("line.separator");
    sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
    int nut = numUniqueTexts();
    if (nut > lastPrintedNumUniqueTexts) {
      print = true;
      sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
      lastPrintedNumUniqueTexts = nut;
    }
    long nub = numUniqueBytes();
    if (nub > lastPrintedNumUniqueBytes) {
      print = true;
      sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
      lastPrintedNumUniqueBytes = nub;
    }
    if (getCount()>0) {
      print = true;
      sb.append("num docs added since last inputs reset:   ").append(Format.format(0,getCount(),col)).append(newline);
      sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline);
    }
    if (print) {
      System.out.println(sb.append(newline).toString());
      printNum++;
    }
  
public synchronized voidresetInputs()

    printDocStatistics();
    numBytes = 0;
    numDocsCreated = 0;
    resetLeftovers();
  
private voidresetLeftovers()

    leftovr.set(null);
  
public voidsetConfig(org.apache.lucene.benchmark.byTask.utils.Config config)

    this.config = config;
    boolean stored = config.get("doc.stored",false); 
    boolean tokenized = config.get("doc.tokenized",true);
    boolean termVec = config.get("doc.term.vector",false);
    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
    indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
    termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
    storeBytes = config.get("doc.store.body.bytes", false);
    forever = config.get("doc.maker.forever",true);
  
public voidsetHTMLParser(HTMLParser htmlParser)

    this.htmlParser = htmlParser;