Methods Summary
---
protected synchronized void | addBytes(long n)
numBytes += n;
|
protected void | addUniqueBytes(long n)
numUniqueBytes += n;
|
protected void | collectFiles(java.io.File f, java.util.ArrayList inputFiles)
//System.out.println("Collect: "+f.getAbsolutePath());
if (!f.canRead()) {
return;
}
if (f.isDirectory()) {
File files[] = f.listFiles();
for (int i = 0; i < files.length; i++) {
collectFiles(files[i],inputFiles);
}
return;
}
inputFiles.add(f);
addUniqueBytes(f.length());
|
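A minimal sketch of how a file-based subclass might seed its input list with collectFiles; the "docs.dir" property name and the inputFiles list are assumptions about such a subclass, not members shown here:

ArrayList inputFiles = new ArrayList();
File dataDir = new File(config.get("docs.dir", "work/docs")); // property name assumed
collectFiles(dataDir, inputFiles);
// every readable file under dataDir is now queued, and numUniqueBytes()
// reflects the total size of the collected files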
private org.apache.lucene.document.Document | createDocument(DocData docData, int size, int cnt)
int docid = incrNumDocsCreated();
Document doc = new Document();
doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
if (docData.getName()!=null) {
String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
}
if (docData.getDate()!=null) {
String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
}
if (docData.getTitle()!=null) {
doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
}
if (docData.getBody()!=null && docData.getBody().length()>0) {
String bdy;
if (size<=0 || size>=docData.getBody().length()) {
bdy = docData.getBody(); // use all
docData.setBody(""); // nothing left
} else {
// attempt not to break words - if whitespace found within next 20 chars...
for (int n=size-1; n<size+20 && n<docData.getBody().length(); n++) {
if (Character.isWhitespace(docData.getBody().charAt(n))) {
size = n;
break;
}
}
bdy = docData.getBody().substring(0,size); // use part
docData.setBody(docData.getBody().substring(size)); // some left
}
doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
if (storeBytes == true) {
doc.add(new Field("bytes", bdy.getBytes("UTF-8"), Field.Store.YES));
}
}
if (docData.getProps()!=null) {
for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext(); ) {
String key = (String) it.next();
String val = (String) docData.getProps().get(key);
doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
}
docData.setProps(null);
}
//System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
return doc;
|
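To make the body-splitting logic concrete, here is a standalone sketch (independent of this class) of what the whitespace scan above does for a small input:

String body = "The quick brown fox";
int size = 10;                                   // requested chunk size
// scan forward from index size-1 for whitespace within the next 20 chars
for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
  if (Character.isWhitespace(body.charAt(n))) { size = n; break; }
}
String chunk = body.substring(0, size);          // "The quick"  (word not broken)
String leftover = body.substring(size);          // " brown fox" (kept for the next doc)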
public synchronized long | getByteCount()
return numBytes;
|
public synchronized int | getCount()
return numDocsCreated;
|
public HTMLParser | getHtmlParser()
return htmlParser;
|
protected abstract DocData | getNextDocData()
Return the data of the next document.
All current implementations can create docs forever.
When the input data is exhausted, the input files are iterated over again.
This re-iteration can be avoided by setting doc.maker.forever to false (the default is true).
|
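As an illustration only, a trivial concrete implementation might look like the sketch below; it assumes the five-argument DocData constructor (name, body, title, date, props) used by the bundled simple makers, and that the forever flag set in setConfig is visible to subclasses:

private int synthId = 0;

protected synchronized DocData getNextDocData() throws NoMoreDataException {
  if (synthId > 0 && !forever) {
    throw new NoMoreDataException();          // honor doc.maker.forever=false
  }
  String body = "sample body text for synthetic doc " + synthId;
  addBytes(body.length());                    // account the bytes handed out
  return new DocData("synth" + (synthId++), body, null, null, null); // assumed ctor
}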
private synchronized int | incrNumDocsCreated()
return numDocsCreated++;
|
public org.apache.lucene.document.Document | makeDocument()
resetLeftovers();
DocData docData = getNextDocData();
Document doc = createDocument(docData,0,-1);
return doc;
|
public org.apache.lucene.document.Document | makeDocument(int size)
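// reuse any body text this thread left over from its previous sized call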
LeftOver lvr = (LeftOver) leftovr.get();
if (lvr==null || lvr.docdata==null || lvr.docdata.getBody()==null || lvr.docdata.getBody().length()==0) {
resetLeftovers();
}
DocData dd = (lvr==null ? getNextDocData() : lvr.docdata);
int cnt = (lvr==null ? 0 : lvr.cnt);
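// pull more input docs until there is at least 'size' chars of body text to slice from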
while (dd.getBody()==null || dd.getBody().length()<size) {
DocData dd2 = dd;
dd = getNextDocData();
cnt = 0;
dd.setBody(dd2.getBody() + dd.getBody());
}
Document doc = createDocument(dd,size,cnt);
if (dd.getBody()==null || dd.getBody().length()==0) {
resetLeftovers();
} else {
if (lvr == null) {
lvr = new LeftOver();
leftovr.set(lvr);
}
lvr.docdata = dd;
lvr.cnt = ++cnt;
}
return doc;
|
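A minimal usage sketch, assuming maker is any configured concrete subclass: successive sized calls slice through the input texts in order, carrying the unread remainder per thread via the leftover mechanism above.

for (int i = 0; i < 3; i++) {
  Document d = maker.makeDocument(500);   // body of roughly 500 chars
  // d carries a "docid" field plus a BODY_FIELD chunk; the unread tail of the
  // current input text is saved for this thread's next call
}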
public long | numUniqueBytes()
return numUniqueBytes;
|
public void | printDocStatistics()
boolean print = false;
String col = " ";
StringBuffer sb = new StringBuffer();
String newline = System.getProperty("line.separator");
sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
int nut = numUniqueTexts();
if (nut > lastPrintedNumUniqueTexts) {
print = true;
sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
lastPrintedNumUniqueTexts = nut;
}
long nub = numUniqueBytes();
if (nub > lastPrintedNumUniqueBytes) {
print = true;
sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
lastPrintedNumUniqueBytes = nub;
}
if (getCount()>0) {
print = true;
sb.append("num docs added since last inputs reset: ").append(Format.format(0,getCount(),col)).append(newline);
sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline);
}
if (print) {
System.out.println(sb.append(newline).toString());
printNum++;
}
|
public synchronized void | resetInputs()
printDocStatistics();
numBytes = 0;
numDocsCreated = 0;
resetLeftovers();
|
private void | resetLeftovers()
leftovr.set(null);
|
public void | setConfig(org.apache.lucene.benchmark.byTask.utils.Config config)
this.config = config;
boolean stored = config.get("doc.stored",false);
boolean tokenized = config.get("doc.tokenized",true);
boolean termVec = config.get("doc.term.vector",false);
storeVal = (stored ? Field.Store.YES : Field.Store.NO);
indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
storeBytes = config.get("doc.store.body.bytes", false);
forever = config.get("doc.maker.forever",true);
|
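A minimal sketch of configuring a maker programmatically, assuming Config's Properties-based constructor; SomeDocMaker stands in for any concrete subclass:

Properties p = new Properties();
p.setProperty("doc.stored", "true");             // Field.Store.YES
p.setProperty("doc.tokenized", "true");          // Field.Index.TOKENIZED
p.setProperty("doc.term.vector", "false");       // Field.TermVector.NO
p.setProperty("doc.store.body.bytes", "false");  // skip the raw "bytes" field
p.setProperty("doc.maker.forever", "true");      // re-iterate inputs when exhausted

SomeDocMaker maker = new SomeDocMaker();         // hypothetical concrete subclass
maker.setConfig(new Config(p));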
public void | setHTMLParser(HTMLParser htmlParser)
this.htmlParser = htmlParser;
|