File: BasicDocMaker.java — Category: API Doc — Apache Lucene 2.2.0 — 10325 bytes — Sat Jun 16 22:20:58 BST 2007 — Package: org.apache.lucene.benchmark.byTask.feeds

BasicDocMaker

public abstract class BasicDocMaker extends Object implements DocMaker
Create documents for the test. Maintains counters of chars etc. so that sub-classes just need to provide textual content, and the create-by-size is handled here.

Config Params (default is in caps): doc.stored=true|FALSE
doc.tokenized=TRUE|false
doc.term.vector=true|FALSE
doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field

Fields Summary
private int
numDocsCreated
private boolean
storeBytes
protected boolean
forever
private ThreadLocal
leftovr
static final String
BODY_FIELD
private long
numBytes
private long
numUniqueBytes
protected org.apache.lucene.benchmark.byTask.utils.Config
config
protected Field$Store
storeVal
protected Field$Index
indexVal
protected Field$TermVector
termVecVal
private int
lastPrintedNumUniqueTexts
private long
lastPrintedNumUniqueBytes
private int
printNum
private HTMLParser
htmlParser
Constructors Summary
Methods Summary
protected synchronized voidaddBytes(long n)

    numBytes += n;
  
protected voidaddUniqueBytes(long n)

    numUniqueBytes += n;
  
protected voidcollectFiles(java.io.File f, java.util.ArrayList inputFiles)

    //System.out.println("Collect: "+f.getAbsolutePath());
    if (!f.canRead()) {
      return;
    }
    if (f.isDirectory()) {
      File files[] = f.listFiles();
      for (int i = 0; i < files.length; i++) {
        collectFiles(files[i],inputFiles);
      }
      return;
    }
    inputFiles.add(f);
    addUniqueBytes(f.length());
  
/**
 * Build a Lucene Document from the given input data, consuming up to
 * {@code size} chars of its body. The docData body is mutated in place:
 * whatever text this document used is removed, and the remainder (if any)
 * is left for the caller's leftover mechanism.
 *
 * @param docData source text and metadata; its body is consumed in place
 * @param size    number of body chars to use; {@code <=0} means use the whole body
 * @param cnt     sub-document counter appended to the doc name, or negative for none
 * @return the populated Document
 */
private org.apache.lucene.document.DocumentcreateDocument(DocData docData, int size, int cnt)

    // Every created doc gets a unique, monotonically increasing id.
    int docid = incrNumDocsCreated();
    Document doc = new Document();
    doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
    if (docData.getName()!=null) {
      // When this is the cnt-th slice of a larger text, suffix the name with "_cnt".
      String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
      doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
    }
    if (docData.getDate()!=null) {
      String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
      doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
    }
    if (docData.getTitle()!=null) {
      doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
    }
    if (docData.getBody()!=null && docData.getBody().length()>0) {
      String bdy;
      if (size<=0 || size>=docData.getBody().length()) {
        bdy = docData.getBody(); // use all
        docData.setBody("");  // nothing left
      } else {
        // attempt not to break words - if whitespace found within next 20 chars...
        for (int n=size-1; n<size+20 && n<docData.getBody().length(); n++) {
          if (Character.isWhitespace(docData.getBody().charAt(n))) {
            size = n;
            break;
          }
        }
        bdy = docData.getBody().substring(0,size); // use part
        docData.setBody(docData.getBody().substring(size)); // some left
      }
      doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
      if (storeBytes == true) {
        // Store the raw UTF-8 bytes of the body when doc.store.body.bytes=true.
        // NOTE(review): getBytes("UTF-8") declares UnsupportedEncodingException;
        // presumably the (not shown) throws clause of this method covers it - confirm.
        doc.add(new Field("bytes", bdy.getBytes("UTF-8"), Field.Store.YES));
      }
    }

    if (docData.getProps()!=null) {
      // Copy any extra source-specific properties into same-named fields,
      // then release them so they are not re-added for a later slice.
      for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext(); ) {
        String key = (String) it.next();
        String val = (String) docData.getProps().get(key);
        doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
      }
      docData.setProps(null);
    }
    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
    return doc;
public synchronized longgetByteCount()

    return numBytes;
  
public synchronized intgetCount()

    return numDocsCreated;
  
public HTMLParsergetHtmlParser()

    return htmlParser;
  
protected abstract DocDatagetNextDocData()
Return the data of the next document. All current implementations can create docs forever. When the input data is exhausted, input files are iterated. This re-iteration can be avoided by setting doc.maker.forever to false (default is true).

return
data of the next document.
exception
Exception, if the next document's data cannot be created.
exception
NoMoreDataException, if the input data is exhausted (and 'forever' is set to false).

private synchronized intincrNumDocsCreated()

  
      
    return numDocsCreated++;
  
public org.apache.lucene.document.DocumentmakeDocument()

    resetLeftovers();
    DocData docData = getNextDocData();
    Document doc = createDocument(docData,0,-1);
    return doc;
  
/**
 * Create a document of (approximately) {@code size} chars, stitching input
 * texts together when one alone is too short, and stashing any unused
 * remainder in a per-thread leftover for the next call.
 *
 * @param size desired number of body chars for the new document
 * @return the newly created document
 */
public org.apache.lucene.document.DocumentmakeDocument(int size)

    LeftOver lvr = (LeftOver) leftovr.get();
    // A leftover with no remaining body text is stale - clear the thread-local
    // (note: the local lvr reference is still used below even after the reset).
    if (lvr==null || lvr.docdata==null || lvr.docdata.getBody()==null || lvr.docdata.getBody().length()==0) {
      resetLeftovers();
    }
    // Continue from the leftover text if there is one, else fetch fresh data.
    DocData dd = (lvr==null ? getNextDocData() : lvr.docdata);
    int cnt = (lvr==null ? 0 : lvr.cnt);
    // Accumulate input texts until we have at least 'size' chars of body.
    while (dd.getBody()==null || dd.getBody().length()<size) {
      // NOTE(review): if dd2.getBody() is null here, string concatenation
      // inserts the literal text "null" into the body - confirm the data
      // sources always supply a non-null body before the first iteration.
      DocData dd2 = dd;
      dd = getNextDocData();
      cnt = 0;
      dd.setBody(dd2.getBody() + dd.getBody());
    }
    // createDocument() consumes up to 'size' chars from dd's body in place.
    Document doc = createDocument(dd,size,cnt);
    if (dd.getBody()==null || dd.getBody().length()==0) {
      // Nothing left over - drop any stored leftover for this thread.
      resetLeftovers();
    } else {
      // Remember the unconsumed remainder (and slice counter) for next time.
      if (lvr == null) {
        lvr = new LeftOver();
        leftovr.set(lvr);
      }
      lvr.docdata = dd;
      lvr.cnt = ++cnt;
    }
    return doc;
public longnumUniqueBytes()

    return numUniqueBytes;
  
public voidprintDocStatistics()

  
     
    boolean print = false;
    String col = "                  ";
    StringBuffer sb = new StringBuffer();
    String newline = System.getProperty("line.separator");
    sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
    int nut = numUniqueTexts();
    if (nut > lastPrintedNumUniqueTexts) {
      print = true;
      sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
      lastPrintedNumUniqueTexts = nut;
    }
    long nub = numUniqueBytes();
    if (nub > lastPrintedNumUniqueBytes) {
      print = true;
      sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
      lastPrintedNumUniqueBytes = nub;
    }
    if (getCount()>0) {
      print = true;
      sb.append("num docs added since last inputs reset:   ").append(Format.format(0,getCount(),col)).append(newline);
      sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline);
    }
    if (print) {
      System.out.println(sb.append(newline).toString());
      printNum++;
    }
  
public synchronized voidresetInputs()

    printDocStatistics();
    numBytes = 0;
    numDocsCreated = 0;
    resetLeftovers();
  
private voidresetLeftovers()

    leftovr.set(null);
  
public voidsetConfig(org.apache.lucene.benchmark.byTask.utils.Config config)

    this.config = config;
    boolean stored = config.get("doc.stored",false); 
    boolean tokenized = config.get("doc.tokenized",true);
    boolean termVec = config.get("doc.term.vector",false);
    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
    indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
    termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
    storeBytes = config.get("doc.store.body.bytes", false);
    forever = config.get("doc.maker.forever",true);
  
public voidsetHTMLParser(HTMLParser htmlParser)

    this.htmlParser = htmlParser;