MemoryIndexTest
public class MemoryIndexTest extends TestCase
Verifies that Lucene MemoryIndex and RAMDirectory have the same behaviour,
returning the same results for any given query.
Runs a set of queries against a set of files and compares results for identity.
Can also be used as a simple benchmark.
Example usage:
cd lucene-svn
java -server -cp ~/unix/java/share/misc/junit/junit.jar:build/classes:build/lucene-core-2.1-dev.jar:build/contrib/memory/classes/test:build/contrib/memory/classes/java org.apache.lucene.index.memory.MemoryIndexTest 1 1 memram @contrib/memory/src/test/org/apache/lucene/index/memory/testqueries.txt *.txt *.html *.xml xdocs/*.xml src/test/org/apache/lucene/queryParser/*.java contrib/memory/src/java/org/apache/lucene/index/memory/*.java
where testqueries.txt is a file with one query per line, such as:
#
# queries extracted from TestQueryParser.java
#
Apache
Apach~ AND Copy*
a AND b
(a AND b)
c OR (a AND b)
a AND NOT b
a AND -b
a AND !b
a && b
a && ! b
a OR b
a || b
a OR !b
a OR ! b
a OR -b
+term -term term
foo:term AND field:anotherTerm
term AND "phrase phrase"
"hello there"
germ term^2.0
(term)^2.0
(germ term)^2.0
term^2.0
term^2
"germ term"^2.0
"term germ"^2
(foo OR bar) AND (baz OR boo)
((a OR b) AND NOT c) OR d
+(apple "steve jobs") -(foo bar baz)
+title:(dog OR cat) -author:"bob dole"
a&b
a&&b
.NET
"term germ"~2
"term germ"~2 flork
"term"~2
"~2 germ"
"term germ"~2^2
3
term 1.0 1 2
term term1 term2
term*
term*^2
term~
term~0.7
term~^2
term^2~
term*germ
term*germ^3
term*
Term*
TERM*
term*
Term*
TERM*
// Then 'full' wildcard queries:
te?m
Te?m
TE?M
Te?m*gerM
te?m
Te?m
TE?M
Te?m*gerM
term term term
term +stop term
term -stop term
drop AND stop AND roll
term phrase term
term AND NOT phrase term
stop
[ a TO c]
[ a TO c ]
{ a TO c}
{ a TO c }
{ a TO c }^2.0
[ a TO c] OR bar
[ a TO c] AND bar
( bar blar { a TO c})
gack ( bar blar { a TO c})
+weltbank +worlbank
+weltbank\n+worlbank
weltbank \n+worlbank
weltbank \n +worlbank
+weltbank\r+worlbank
weltbank \r+worlbank
weltbank \r +worlbank
+weltbank\r\n+worlbank
weltbank \r\n+worlbank
weltbank \r\n +worlbank
weltbank \r \n +worlbank
+weltbank\t+worlbank
weltbank \t+worlbank
weltbank \t +worlbank
term term term
term +term term
term term +term
term +term +term
-term term term
on^1.0
"hello"^2.0
hello^2.0
"on"^1.0
the^3
|
Fields Summary |
---|
private Analyzer | analyzer | private boolean | fastMode | private static final String | FIELD_NAME | private static final Charset | DEFAULT_PLATFORM_CHARSET |
Methods Summary |
---|
private org.apache.lucene.document.Document | createDocument(java.lang.String content)
Document doc = new Document();
doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
return doc;
| private MemoryIndex | createMemoryIndex(org.apache.lucene.document.Document doc)
// Builds a fresh MemoryIndex and adds every field of the given document to it,
// passing each field's name and string value together with the current analyzer.
MemoryIndex memoryIndex = new MemoryIndex();
for (Enumeration fields = doc.fields(); fields.hasMoreElements(); ) {
    Field current = (Field) fields.nextElement();
    String fieldName = current.name();
    String fieldText = current.stringValue();
    memoryIndex.addField(fieldName, fieldText, analyzer);
}
return memoryIndex;
| private org.apache.lucene.store.RAMDirectory | createRAMIndex(org.apache.lucene.document.Document doc)
// Writes the given document into a new RAMDirectory using the current analyzer,
// optimizes the index, closes the writer and returns the directory.
// Any IOException (from writing or from closing) is rethrown as a RuntimeException.
RAMDirectory directory = new RAMDirectory();
IndexWriter indexWriter = null;
try {
    indexWriter = new IndexWriter(directory, analyzer, true);
    // lift the field length limit as far as an int allows
    indexWriter.setMaxFieldLength(Integer.MAX_VALUE);
    indexWriter.addDocument(doc);
    indexWriter.optimize();
} catch (IOException ioe) { // should never happen (RAMDirectory)
    throw new RuntimeException(ioe);
} finally {
    if (indexWriter != null) {
        try {
            indexWriter.close();
        } catch (IOException ioe) { // should never happen (RAMDirectory)
            throw new RuntimeException(ioe);
        }
    }
}
return directory;
| private int | getMemorySize(java.lang.Object index)
// Reports the size of the given index.
// For a Directory, the result is the sum of fileLength() over every name returned
// by list(); anything else is cast to MemoryIndex and asked for getMemorySize().
// An IOException while inspecting the directory is rethrown as a RuntimeException.
if (!(index instanceof Directory)) {
    return ((MemoryIndex) index).getMemorySize();
}
Directory directory = (Directory) index;
try {
    int total = 0;
    String[] names = directory.list();
    for (int n = 0; n < names.length; n++) {
        total += directory.fileLength(names[n]);
    }
    return total;
} catch (IOException ioe) { // can never happen (RAMDirectory)
    throw new RuntimeException(ioe);
}
| static java.lang.String[] | listFiles(java.lang.String[] fileNames)returns all files matching the given file name patterns (quick n'dirty)
// Expands each entry of fileNames into concrete paths, keeping first-seen order
// and dropping duplicates.
// An entry without "*" is taken over unchanged. Otherwise the text before the
// first "*" is used as the directory to list ("." when empty) and the text after
// it as the required file name ending. Directories that cannot be listed
// contribute nothing.
LinkedHashSet matches = new LinkedHashSet();
for (int n = 0; n < fileNames.length; n++) {
    String pattern = fileNames[n];
    int star = pattern.indexOf("*");
    if (star < 0) {
        matches.add(pattern);
        continue;
    }
    String dirName = (star == 0) ? "." : pattern.substring(0, star);
    final String ending = pattern.substring(star + 1);
    FilenameFilter endingFilter = new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(ending);
        }
    };
    File[] found = new File(dirName).listFiles(endingFilter);
    if (found == null) {
        continue; // not a listable directory
    }
    for (int m = 0; m < found.length; m++) {
        matches.add(found[m].getPath());
    }
}
return (String[]) matches.toArray(new String[matches.size()]);
| public static void | main(java.lang.String[] args)Runs the tests and/or benchmark
// Command-line entry point: creates a MemoryIndexTest and hands the
// arguments to its run method unchanged.
MemoryIndexTest test = new MemoryIndexTest();
test.run(args);
| private org.apache.lucene.search.Query | parseQuery(java.lang.String expression)
QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
// parser.setPhraseSlop(0);
return parser.parse(expression);
| private float | query(java.lang.Object index, org.apache.lucene.search.Query query)
// Runs the query against the given index and returns the score passed to the
// most recent collect() call, or 0.0f when nothing was collected.
// A Directory is searched through a new IndexSearcher; anything else is cast to
// MemoryIndex and searched through createSearcher(). The searcher is always
// closed; any IOException is rethrown as a RuntimeException.
Searcher searcher = null;
try {
    if (!(index instanceof Directory)) {
        searcher = ((MemoryIndex) index).createSearcher();
    } else {
        searcher = new IndexSearcher((Directory) index);
    }
    final float[] lastScore = new float[1]; // element starts out as 0.0f
    HitCollector collector = new HitCollector() {
        public void collect(int doc, float score) {
            lastScore[0] = score;
        }
    };
    searcher.search(query, collector);
    return lastScore[0];
} catch (IOException ioe) { // should never happen (RAMDirectory)
    throw new RuntimeException(ioe);
} finally {
    if (searcher != null) {
        try {
            searcher.close();
        } catch (IOException ioe) { // should never happen (RAMDirectory)
            throw new RuntimeException(ioe);
        }
    }
}
| private java.lang.String[] | readLines(java.io.File file)
// Reads the given text file and returns its significant lines in file order.
// A line is skipped when, after trimming, it is empty or starts with '#' or "//".
// Kept lines are returned untrimmed, exactly as read.
// The file is decoded with the platform default charset (InputStreamReader default).
// An IOException from opening or reading the file propagates to the caller; the
// reader is closed in all cases.
ArrayList lines = new ArrayList();
BufferedReader reader = new BufferedReader(new InputStreamReader(
    new FileInputStream(file)));
try {
    String line;
    while ((line = reader.readLine()) != null) {
        String t = line.trim();
        // '#' and "//" both introduce comment lines in the query file
        if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) {
            lines.add(line);
        }
    }
} finally {
    reader.close(); // release the file handle even when reading fails
}
String[] result = new String[lines.size()];
lines.toArray(result);
return result;
| private void | run(java.lang.String[] args)
// Runs every query against every existing input file under each analyzer, using a
// MemoryIndex and/or a RAMDirectory index, and prints timing figures per iteration.
// Positional arguments, all optional:
//   args[0]    number of iterations (at least 1)
//   args[1]    number of runs per query (at least 1)
//   args[2]    command string; containing "mem" and/or "ram" selects the index types
//   args[3]    a single query, or "@path" naming a query file read via readLines
//   args[4..]  files to index
// When both index types are selected, an IllegalStateException is raised as soon as
// the two scores differ or either lies outside [0, 1]; it is reported and rethrown.
// k is pre-incremented inside each check, so an argument is only read when present.
int k = -1;
int iters = 1;
if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
int runs = 1;
if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
String cmd = "memram";
if (args.length > ++k) cmd = args[k];
// substring match: the default "memram" enables both index types
boolean useMemIndex = cmd.indexOf("mem") >= 0;
boolean useRAMIndex = cmd.indexOf("ram") >= 0;
// built-in queries, used when no query argument is given
String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
if (args.length > ++k) {
String arg = args[k];
if (arg.startsWith("@"))
queries = readLines(new File(arg.substring(1)));
else
queries = new String[] { arg };
}
// built-in input files, used when no file arguments are given
File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
if (args.length > ++k) {
files = new File[args.length - k];
for (int i=k; i < args.length; i++) {
files[i-k] = new File(args[i]);
}
}
// NOTE(review): toLowerCase and stopWords are not referenced by any active code
// below; they only appear in the commented-out analyzer entries.
boolean toLowerCase = true;
// boolean toLowerCase = false;
// Set stopWords = null;
Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
Analyzer[] analyzers = new Analyzer[] {
new SimpleAnalyzer(),
new StopAnalyzer(),
new StandardAnalyzer(),
PatternAnalyzer.DEFAULT_ANALYZER,
// new WhitespaceAnalyzer(),
// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),
// new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
};
for (int iter=0; iter < iters; iter++) {
System.out.println("\n########### iteration=" + iter);
long start = System.currentTimeMillis();
// total length of all files processed, counted once per analyzer
long bytes = 0;
for (int anal=0; anal < analyzers.length; anal++) {
// the analyzer field is read by the create*/parseQuery helpers
this.analyzer = analyzers[anal];
for (int i=0; i < files.length; i++) {
File file = files[i];
if (!file.exists() || file.isDirectory()) continue; // ignore
bytes += file.length();
// null charset: toString falls back to DEFAULT_PLATFORM_CHARSET
String text = toString(new FileInputStream(file), null);
Document doc = createDocument(text);
System.out.println("\n*********** FILE=" + file);
for (int q=0; q < queries.length; q++) {
try {
Query query = parseQuery(queries[q]);
// With measureIndexing == false each index is built once per query, before the
// run loop; with true it would be rebuilt inside every run.
boolean measureIndexing = false; // toggle this to measure query performance
MemoryIndex memind = null;
if (useMemIndex && !measureIndexing) memind = createMemoryIndex(doc);
RAMDirectory ramind = null;
if (useRAMIndex && !measureIndexing) ramind = createRAMIndex(doc);
for (int run=0; run < runs; run++) {
float score1 = 0.0f; float score2 = 0.0f;
if (useMemIndex && measureIndexing) memind = createMemoryIndex(doc);
if (useMemIndex) score1 = query(memind, query);
if (useRAMIndex && measureIndexing) ramind = createRAMIndex(doc);
if (useRAMIndex) score2 = query(ramind, query);
// correctness is only checked when both index types were queried
if (useMemIndex && useRAMIndex) {
System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
// scores must be exactly equal and both within [0, 1]
if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
}
}
}
} catch (Throwable t) {
// report which query/file/analyzer failed, then propagate the original error
if (t instanceof OutOfMemoryError) t.printStackTrace();
System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
throw t;
}
}
}
}
long end = System.currentTimeMillis();
System.out.println("\nsecs = " + ((end-start)/1000.0f));
// NOTE(review): files.length also counts files skipped by the exists/isDirectory
// check above, so this figure is overstated when some inputs are missing.
System.out.println("queries/sec= " +
(1.0f * runs * queries.length * analyzers.length * files.length
/ ((end-start)/1000.0f)));
float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
}
if (useMemIndex && useRAMIndex)
System.out.println("No bug found. done.");
else
System.out.println("Done benchmarking (without checking correctness).");
| public void | testMany()
// Test entry point: expands a fixed set of file name patterns via listFiles,
// prints the resulting file list, and calls run with the arguments
// "1", "1", "memram", the "@..." query file reference, followed by those files.
String[] patterns = new String[] {
    "*.txt", "*.html", "*.xml", "xdocs/*.xml",
    "src/java/test/org/apache/lucene/queryParser/*.java",
    "src/java/org/apache/lucene/index/memory/*.java",
};
String[] files = listFiles(patterns);
System.out.println("files = " + java.util.Arrays.asList(files));
String[] fixedArgs = new String[] {
    "1", "1", "memram",
    "@src/test/org/apache/lucene/index/memory/testqueries.txt",
};
// fixed arguments first, then one argument per file
String[] args = new String[fixedArgs.length + files.length];
for (int i = 0; i < fixedArgs.length; i++) {
    args[i] = fixedArgs[i];
}
for (int i = 0; i < files.length; i++) {
    args[fixedArgs.length + i] = files[i];
}
run(args);
| private static byte[] | toByteArray(java.io.InputStream input)
// Reads the stream to its end and returns exactly the bytes read.
// The stream is closed before returning, also when reading fails.
try {
    // input.available() is only a sizing hint (never below 256); the loop
    // below stays correct whatever value it reports
    int capacity = Math.max(256, input.available());
    byte[] chunk = new byte[capacity];
    byte[] collected = new byte[capacity];
    int total = 0;
    for (int count = input.read(chunk); count >= 0; count = input.read(chunk)) {
        int needed = total + count;
        if (needed <= collected.length) {
            System.arraycopy(chunk, 0, collected, total, count);
        } else {
            // grow to at least double the current capacity
            byte[] grown = new byte[Math.max(collected.length << 1, needed)];
            System.arraycopy(collected, 0, grown, 0, total);
            System.arraycopy(chunk, 0, grown, total, count);
            chunk = collected; // the old, larger array becomes the next read buffer
            collected = grown;
        }
        total = needed;
    }
    if (total == collected.length) {
        return collected; // already exactly sized
    }
    byte[] exact = new byte[total];
    System.arraycopy(collected, 0, exact, 0, total);
    return exact;
} finally {
    if (input != null) input.close();
}
| private static java.lang.String | toString(java.io.InputStream input, java.nio.charset.Charset charset)
// Utility code copied from the Apache style Nux library - see http://dsd.lbl.gov/nux
// Reads the whole stream via toByteArray and decodes the bytes with the given
// charset; a null charset means DEFAULT_PLATFORM_CHARSET.
Charset effective = (charset != null) ? charset : DEFAULT_PLATFORM_CHARSET;
ByteBuffer bytes = ByteBuffer.wrap(toByteArray(input));
return effective.decode(bytes).toString();
|
|