StandardBenchmarker

public class StandardBenchmarker extends org.apache.lucene.benchmark.AbstractBenchmarker implements org.apache.lucene.benchmark.Benchmarker

Reads in the Reuters collection, downloaded from http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz, into workingDir/reuters and indexes the files using the {@link org.apache.lucene.analysis.standard.StandardAnalyzer}. Runs a standard set of documents through an Indexer and then runs a standard set of queries against the index.
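A minimal driver sketch. The no-argument StandardOptions constructor and the setRunCount/setScaleUp/setLogStep setters are assumptions mirroring the getters used by runBenchmark() and makeIndex() below, and the org.apache.lucene.benchmark.standard package is assumed; everything else appears on this page:

import java.io.File;
import org.apache.lucene.benchmark.standard.StandardBenchmarker;
import org.apache.lucene.benchmark.standard.StandardOptions;
import org.apache.lucene.benchmark.stats.TestData;

public class BenchmarkDriver
{
    public static void main(String[] args) throws Exception
    {
        StandardBenchmarker benchmarker = new StandardBenchmarker();
        StandardOptions options = new StandardOptions();   // assumed no-arg constructor
        options.setRunCount(3);      // assumed setter for getRunCount()
        options.setScaleUp(1);       // assumed setter for getScaleUp()
        options.setLogStep(1000);    // assumed setter for getLogStep()
        // Expects the extracted Reuters files under work/reuters, per the class comment.
        TestData[] results = benchmarker.benchmark(new File("work"), options);
        for (int i = 0; i < results.length; i++)
        {
            System.out.println(results[i].showRunData(results[i].getId()));
        }
    }
}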
Fields Summary

public static final String SOURCE_DIR
public static final String INDEX_DIR
private static DateFormat format
Constructors Summary
public StandardBenchmarker()
// Alternative formatter: DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.SHORT);
format.setLenient(true);
Methods Summary
public org.apache.lucene.benchmark.stats.TestData[] benchmark(java.io.File workingDir, org.apache.lucene.benchmark.BenchmarkOptions opts)
StandardOptions options = (StandardOptions) opts;
workingDir.mkdirs();
File sourceDir = getSourceDirectory(workingDir);
sourceDir.mkdirs();
File indexDir = new File(workingDir, INDEX_DIR);
indexDir.mkdirs();
Analyzer a = new StandardAnalyzer();
List queryList = new ArrayList(20);
queryList.addAll(Arrays.asList(ReutersQueries.STANDARD_QUERIES));
queryList.addAll(Arrays.asList(ReutersQueries.getPrebuiltQueries("body")));
Query[] qs = createQueries(queryList, a);
// Here you can limit the set of query benchmarks
QueryData[] qds = QueryData.getAll(qs);
// Here you can narrow down the set of test parameters
TestData[] params = TestData.getTestDataMinMaxMergeAndMaxBuffered(new File[]{sourceDir/*, jumboDir*/}, new Analyzer[]{a});
// Alternative: TestData.getAll(new File[]{sourceDir, jumboDir}, new Analyzer[]{a});
System.out.println("Testing " + params.length + " different permutations.");
for (int i = 0; i < params.length; i++)
{
try
{
reset(indexDir);
params[i].setDirectory(FSDirectory.getDirectory(indexDir));
params[i].setQueries(qds);
System.out.println(params[i]);
runBenchmark(params[i], options);
// Here you can collect and output the runData for further processing.
System.out.println(params[i].showRunData(params[i].getId()));
//bench.runSearchBenchmark(queries, dir);
params[i].getDirectory().close();
System.runFinalization();
System.gc();
}
catch (Exception e)
{
e.printStackTrace();
System.out.println("EXCEPTION: " + e.getMessage());
//break;
}
}
return params;
public static org.apache.lucene.search.Query[] createQueries(java.util.List qs, org.apache.lucene.analysis.Analyzer a)

Parse the strings containing Lucene queries.
QueryParser qp = new QueryParser("body", a);
List queries = new ArrayList();
for (int i = 0; i < qs.size(); i++)
{
try
{
Object query = qs.get(i);
Query q = null;
if (query instanceof String)
{
q = qp.parse((String) query);
}
else if (query instanceof Query)
{
q = (Query) query;
}
else
{
System.err.println("Unsupported Query Type: " + query);
}
if (q != null)
{
queries.add(q);
}
}
catch (Exception e)
{
e.printStackTrace();
}
}
return (Query[]) queries.toArray(new Query[0]);
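A usage sketch showing the two element types the loop above accepts: a String, parsed against the body field, and a ready-made Query, passed through unchanged. TermQuery and Term are standard Lucene classes; the snippet assumes the surrounding class imports org.apache.lucene.search.* and StandardAnalyzer:

List qs = new ArrayList();
qs.add("oil AND dlrs");                               // parsed by the QueryParser
qs.add(new TermQuery(new Term("title", "cocoa")));    // used as-is
Query[] parsed = createQueries(qs, new StandardAnalyzer());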
public static void getAllFiles(java.io.File srcDir, java.io.FileFilter filter, java.util.List allFiles)

Recursively collect all non-directory files under srcDir that pass the filter, adding them to allFiles.
File [] files = srcDir.listFiles(filter);
for (int i = 0; i < files.length; i++)
{
File file = files[i];
if (file.isDirectory())
{
getAllFiles(file, filter, allFiles);
}
else
{
allFiles.add(file);
}
}
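Note that listFiles(filter) is also what surfaces subdirectories for recursion, so any non-null filter must accept directories or the walk stops at the first level. A usage sketch with a filter that keeps .txt files while still descending (the .txt extension is only an illustration):

List txtFiles = new ArrayList();
StandardBenchmarker.getAllFiles(new File("work/reuters"), new FileFilter()
{
    public boolean accept(File f)
    {
        // Accept directories so recursion continues below them.
        return f.isDirectory() || f.getName().endsWith(".txt");
    }
}, txtFiles);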
protected java.io.File getSourceDirectory(java.io.File workingDir)
return new File(workingDir, SOURCE_DIR);
protected org.apache.lucene.document.Document makeDocument(java.io.File in, java.lang.String[] tags, boolean stored, boolean tokenized, boolean tfv)

Parse the Reuters SGML and index: Date, Title, Dateline, Body.
Document doc = new Document();
// tag this document
if (tags != null)
{
for (int i = 0; i < tags.length; i++)
{
doc.add(new Field("tag" + i, tags[i], stored == true ? Field.Store.YES : Field.Store.NO,
tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO));
}
}
doc.add(new Field("file", in.getCanonicalPath(), stored == true ? Field.Store.YES : Field.Store.NO,
tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO));
BufferedReader reader = new BufferedReader(new FileReader(in));
String line = null;
//First line is the date, 3rd is the title, rest is body
String dateStr = reader.readLine();
reader.readLine();//skip an empty line
String title = reader.readLine();
reader.readLine();//skip an empty line
StringBuffer body = new StringBuffer(1024);
while ((line = reader.readLine()) != null)
{
body.append(line).append(' ');
}
reader.close();
Date date = format.parse(dateStr.trim());
doc.add(new Field("date", DateTools.dateToString(date, DateTools.Resolution.SECOND), Field.Store.YES, Field.Index.UN_TOKENIZED));
if (title != null)
{
doc.add(new Field("title", title, stored == true ? Field.Store.YES : Field.Store.NO,
tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO));
}
if (body.length() > 0)
{
doc.add(new Field("body", body.toString(), stored == true ? Field.Store.YES : Field.Store.NO,
tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO));
}
return doc;
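The reader logic above implies a fixed plain-text layout for each input file; a sketch of a conforming file, as Java comments. The sample date value is an assumption, since the exact pattern behind the lenient format field is not shown on this page:

// Line 1:  date string, e.g. 26-FEB-1987 15:01:01.79   (parsed by format.parse())
// Line 2:  blank                                        (skipped)
// Line 3:  title                                        (indexed into the "title" field)
// Line 4:  blank                                        (skipped)
// Line 5+: body text, joined with single spaces into the "body" field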
protected void makeIndex(org.apache.lucene.benchmark.stats.TestRunData trd, java.io.File srcDir, org.apache.lucene.index.IndexWriter iw, boolean stored, boolean tokenized, boolean tfv, StandardOptions options)

Make index, and collect time data.
//File[] groups = srcDir.listFiles();
List files = new ArrayList();
getAllFiles(srcDir, null, files);
Document doc = null;
long cnt = 0L;
TimeData td = new TimeData();
td.name = "addDocument";
int scaleUp = options.getScaleUp();
int logStep = options.getLogStep();
int max = Math.min(files.size(), options.getMaximumDocumentsToIndex());
for (int s = 0; s < scaleUp; s++)
{
String[] tags = new String[]{srcDir.getName() + "/" + s};
int i = 0;
for (Iterator iterator = files.iterator(); iterator.hasNext() && i < max; i++)
{
File file = (File) iterator.next();
doc = makeDocument(file, tags, stored, tokenized, tfv);
td.start();
iw.addDocument(doc);
td.stop();
cnt++;
if (cnt % logStep == 0)
{
System.err.println(" - processed " + cnt + ", run id=" + trd.getId());
trd.addData(td);
td.reset();
}
}
}
trd.addData(td);
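The timing pattern above, in isolation: one named TimeData accumulates start()/stop() intervals, is flushed into the TestRunData every logStep documents, then reset. A sketch using only the TimeData calls that appear in this class; doWork() is a hypothetical timed operation and trd is assumed to be the current TestRunData:

TimeData td = new TimeData();
td.name = "myOperation";
for (int n = 0; n < 1000; n++)
{
    td.start();
    doWork();              // hypothetical operation being timed
    td.stop();
    if ((n + 1) % 100 == 0)
    {
        trd.addData(td);   // flush accumulated intervals into the run data
        td.reset();
    }
}
trd.addData(td);           // flush the final partial batch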
protected void reset(java.io.File indexDir)

Remove existing index.
if (indexDir.exists())
{
fullyDelete(indexDir);
}
indexDir.mkdirs();
protected void runBenchmark(org.apache.lucene.benchmark.stats.TestData params, StandardOptions options)

Run benchmark using supplied parameters.
System.out.println("Start Time: " + new Date());
int runCount = options.getRunCount();
for (int i = 0; i < runCount; i++)
{
TestRunData trd = new TestRunData();
trd.startRun();
trd.setId(String.valueOf(i));
IndexWriter iw = new IndexWriter(params.getDirectory(), params.getAnalyzer(), true);
iw.setMergeFactor(params.getMergeFactor());
iw.setMaxBufferedDocs(params.getMaxBufferedDocs());
iw.setUseCompoundFile(params.isCompound());
makeIndex(trd, params.getSource(), iw, true, true, false, options);
if (params.isOptimize())
{
TimeData td = new TimeData("optimize");
trd.addData(td);
td.start();
iw.optimize();
td.stop();
trd.addData(td);
}
iw.close();
QueryData[] queries = params.getQueries();
if (queries != null)
{
IndexReader ir = null;
IndexSearcher searcher = null;
for (int k = 0; k < queries.length; k++)
{
QueryData qd = queries[k];
if (ir != null && qd.reopen)
{
searcher.close();
ir.close();
ir = null;
searcher = null;
}
if (ir == null)
{
ir = IndexReader.open(params.getDirectory());
searcher = new IndexSearcher(ir);
}
Document doc = null;
if (qd.warmup)
{
TimeData td = new TimeData(qd.id + "-warm");
for (int m = 0; m < ir.maxDoc(); m++)
{
td.start();
if (ir.isDeleted(m))
{
td.stop();
continue;
}
doc = ir.document(m);
td.stop();
}
trd.addData(td);
}
TimeData td = new TimeData(qd.id + "-srch");
td.start();
Hits h = searcher.search(qd.q);
//System.out.println("Hits Size: " + h.length() + " Query: " + qd.q);
td.stop();
trd.addData(td);
td = new TimeData(qd.id + "-trav");
if (h != null && h.length() > 0)
{
for (int m = 0; m < h.length(); m++)
{
td.start();
int id = h.id(m);
if (qd.retrieve)
{
doc = ir.document(id);
}
td.stop();
}
}
trd.addData(td);
}
try
{
if (searcher != null)
{
searcher.close();
}
}
catch (Exception e)
{
// ignore failures while closing the searcher
}
try
{
if (ir != null)
{
ir.close();
}
}
catch (Exception e)
{
// ignore failures while closing the reader
}
}
trd.endRun();
params.getRunData().add(trd);
//System.out.println(params[i].showRunData(params[i].getId()));
//params.showRunData(params.getId());
}
System.out.println("End Time: " + new Date());
protected void saveStream(java.io.InputStream is, java.io.File out, boolean closeInput)

Save a stream to a file.
byte[] buf = new byte[4096];
FileOutputStream fos = new FileOutputStream(out);
int len = 0;
long total = 0L;
long time = System.currentTimeMillis();
long delta = time;
while ((len = is.read(buf)) > 0)
{
fos.write(buf, 0, len);
total += len;
time = System.currentTimeMillis();
if (time - delta > 5000)
{
System.err.println(" - copied " + total / 1024 + " kB...");
delta = time;
}
}
fos.flush();
fos.close();
if (closeInput)
{
is.close();
}
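A usage sketch pairing saveStream() with the collection URL from the class comment. java.net.URL.openStream() is standard JDK; since saveStream() is protected, the call is assumed to come from a subclass or the same package:

URL url = new URL("http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz");
saveStream(url.openStream(), new File("work/reuters21578.tar.gz"), true);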