FileDocCategorySizeDatePackage
MemoryIndexTest.javaAPI DocApache Lucene 2.0.014304Fri May 26 09:53:52 BST 2006org.apache.lucene.index.memory

MemoryIndexTest

public class MemoryIndexTest extends TestCase
Verifies that Lucene MemoryIndex and RAMDirectory have the same behaviour, returning the same results for any given query. Runs a set of queries against a set of files and compares results for identity. Can also be used as a simple benchmark.

Example usage:

cd lucene-cvs
java org.apache.lucene.index.memory.MemoryIndexTest 1 1 memram @testqueries.txt *.txt *.html *.xml xdocs/*.xml src/test/org/apache/lucene/queryParser/*.java
where testqueries.txt is a file with one query per line, such as:
#
# queries extracted from TestQueryParser.java
#
Apache
Apach~ AND Copy*

a AND b
(a AND b)
c OR (a AND b)
a AND NOT b
a AND -b
a AND !b
a && b
a && ! b

a OR b
a || b
a OR !b
a OR ! b
a OR -b

+term -term term
foo:term AND field:anotherTerm
term AND "phrase phrase"
"hello there"

germ term^2.0
(term)^2.0
(germ term)^2.0
term^2.0
term^2
"germ term"^2.0
"term germ"^2

(foo OR bar) AND (baz OR boo)
((a OR b) AND NOT c) OR d
+(apple "steve jobs") -(foo bar baz)
+title:(dog OR cat) -author:"bob dole"


a&b
a&&b
.NET

"term germ"~2
"term germ"~2 flork
"term"~2
"~2 germ"
"term germ"~2^2

3
term 1.0 1 2
term term1 term2

term*
term*^2
term~
term~0.7
term~^2
term^2~
term*germ
term*germ^3


term*
Term*
TERM*
term*
Term*
TERM*

// Then 'full' wildcard queries:
te?m
Te?m
TE?M
Te?m*gerM
te?m
Te?m
TE?M
Te?m*gerM

term term term
term +stop term
term -stop term
drop AND stop AND roll
term phrase term
term AND NOT phrase term
stop


[ a TO c]
[ a TO c ]
{ a TO c}
{ a TO c }
{ a TO c }^2.0
[ a TO c] OR bar
[ a TO c] AND bar
( bar blar { a TO c})
gack ( bar blar { a TO c})


+weltbank +worlbank
+weltbank\n+worlbank
weltbank \n+worlbank
weltbank \n +worlbank
+weltbank\r+worlbank
weltbank \r+worlbank
weltbank \r +worlbank
+weltbank\r\n+worlbank
weltbank \r\n+worlbank
weltbank \r\n +worlbank
weltbank \r \n +worlbank
+weltbank\t+worlbank
weltbank \t+worlbank
weltbank \t +worlbank


term term term
term +term term
term term +term
term +term +term
-term term term


on^1.0
"hello"^2.0
hello^2.0
"on"^1.0
the^3
author
whoschek.AT.lbl.DOT.gov

Fields Summary
private Analyzer
analyzer
private boolean
fastMode
private static final String
FIELD_NAME
private static final Charset
DEFAULT_PLATFORM_CHARSET
Constructors Summary
Methods Summary
private org.apache.lucene.document.DocumentcreateDocument(java.lang.String content)

		Document doc = new Document();
		doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
		return doc;
	
private MemoryIndexcreateMemoryIndex(org.apache.lucene.document.Document doc)

		MemoryIndex index = new MemoryIndex();
		Enumeration iter = doc.fields();
		while (iter.hasMoreElements()) {
			Field field = (Field) iter.nextElement();
			index.addField(field.name(), field.stringValue(), analyzer);
		}
		return index;
	
private org.apache.lucene.store.RAMDirectorycreateRAMIndex(org.apache.lucene.document.Document doc)

		RAMDirectory dir = new RAMDirectory();		
		IndexWriter writer = null;
		try {
			writer = new IndexWriter(dir, analyzer, true);
			writer.setMaxFieldLength(Integer.MAX_VALUE);
			writer.addDocument(doc);
			writer.optimize();
			return dir;
		} catch (IOException e) { // should never happen (RAMDirectory)
			throw new RuntimeException(e);
		} finally {
			try {
				if (writer != null) writer.close();
			} catch (IOException e) { // should never happen (RAMDirectory)
				throw new RuntimeException(e);
			}
		}
	
private intgetMemorySize(java.lang.Object index)

		if (index instanceof Directory) {
			try {
				Directory dir = (Directory) index;
				int size = 0;
				String[] fileNames = dir.list();
				for (int i=0; i < fileNames.length; i++) {
					size += dir.fileLength(fileNames[i]);
				}
				return size;
			}
			catch (IOException e) { // can never happen (RAMDirectory)
				throw new RuntimeException(e);
			}
		}
		else {
			return ((MemoryIndex) index).getMemorySize();
		}
	
static java.lang.String[]listFiles(java.lang.String[] fileNames)
returns all files matching the given file name patterns (quick n'dirty)

		LinkedHashSet allFiles = new LinkedHashSet();
		for (int i=0; i < fileNames.length; i++) {
			int k;
			if ((k = fileNames[i].indexOf("*")) < 0) {
				allFiles.add(fileNames[i]);
			} else {
				String prefix = fileNames[i].substring(0, k);
				if (prefix.length() == 0) prefix = ".";
				final String suffix = fileNames[i].substring(k+1);
				File[] files = new File(prefix).listFiles(new FilenameFilter() {
					public boolean accept(File dir, String name) {
						return name.endsWith(suffix);
					}
				});
				if (files != null) {
					for (int j=0; j < files.length; j++) {
						allFiles.add(files[j].getPath());
					}
				}
			}			
		}
		
		String[] result = new String[allFiles.size()];
		allFiles.toArray(result);
		return result;
	
public static voidmain(java.lang.String[] args)
Runs the tests and/or benchmark


	      
	       
		new MemoryIndexTest().run(args);		
	
private org.apache.lucene.search.QueryparseQuery(java.lang.String expression)

		QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
//		parser.setPhraseSlop(0);
		return parser.parse(expression);
	
private floatquery(java.lang.Object index, org.apache.lucene.search.Query query)

//		System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
		Searcher searcher = null;
		try {
			if (index instanceof Directory)
				searcher = new IndexSearcher((Directory)index);
			else 
				searcher = ((MemoryIndex) index).createSearcher();

			final float[] scores = new float[1]; // inits to 0.0f
			searcher.search(query, new HitCollector() {
				public void collect(int doc, float score) {
					scores[0] = score;
				}
			});
			float score = scores[0];
//			Hits hits = searcher.search(query);
//			float score = hits.length() > 0 ? hits.score(0) : 0.0f;
			return score;
		} catch (IOException e) { // should never happen (RAMDirectory)
			throw new RuntimeException(e);
		} finally {
			try {
				if (searcher != null) searcher.close();
			} catch (IOException e) { // should never happen (RAMDirectory)
				throw new RuntimeException(e);
			}
		}
	
private java.lang.String[]readLines(java.io.File file)

		BufferedReader reader = new BufferedReader(new InputStreamReader(
				new FileInputStream(file))); 
		ArrayList lines = new ArrayList();
		String line;	
		while ((line = reader.readLine()) != null) {
			String t = line.trim(); 
			if (t.length() > 0 && t.charAt(0) != '#" && (!t.startsWith("//"))) {
				lines.add(line);
			}
		}
		reader.close();
		
		String[] result = new String[lines.size()];
		lines.toArray(result);
		return result;
	
private voidrun(java.lang.String[] args)
// Core driver. Consumes positional arguments (iterations, runs, command,
// queries, files), then for every (analyzer, file, query) combination runs
// each query against a MemoryIndex and/or a RAMDirectory index, comparing
// the two scores for identity when both are enabled, and prints throughput
// statistics per iteration.

		int k = -1; // index of the last consumed command line argument
		
		// arg 0: number of outer timing iterations
		int iters = 1;
		if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
		
		// arg 1: how many times each query is repeated per file
		int runs = 1;
		if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
		
		// arg 2: which index flavours to exercise ("mem", "ram" or "memram")
		String cmd = "memram";
		if (args.length > ++k) cmd = args[k];
		boolean useMemIndex = cmd.indexOf("mem") >= 0;
		boolean useRAMIndex = cmd.indexOf("ram") >= 0;
		
		// arg 3: a single query, or "@file" naming a file with one query per line
		String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
		if (args.length > ++k) {
			String arg = args[k];
			if (arg.startsWith("@")) 
				queries = readLines(new File(arg.substring(1)));
			else
				queries = new String[] { arg };
		}
		
		// remaining args: files to index (defaults to two well-known text files)
		File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
		if (args.length > ++k) {
			files = new File[args.length - k];
			for (int i=k; i < args.length; i++) {
				files[i-k] = new File(args[i]);
			}
		}
		
		boolean toLowerCase = true;
//		boolean toLowerCase = false;
//		Set stopWords = null;
		Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
		
		// every analyzer below is run over every file and every query;
		// commented-out entries are alternatives for manual experiments
		Analyzer[] analyzers = new Analyzer[] { 
				new SimpleAnalyzer(),
				new StopAnalyzer(),
				new StandardAnalyzer(),
				PatternAnalyzer.DEFAULT_ANALYZER,
//				new WhitespaceAnalyzer(),
//				new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
//				new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),				
//				new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
		};
		
		for (int iter=0; iter < iters; iter++) {
			System.out.println("\n########### iteration=" + iter);
			long start = System.currentTimeMillis();						
			long bytes = 0;
			
			for (int anal=0; anal < analyzers.length; anal++) {
				// the helper methods (createDocument, parseQuery, ...) read this field
				this.analyzer = analyzers[anal];
				
				for (int i=0; i < files.length; i++) {
					File file = files[i];
					if (!file.exists() || file.isDirectory()) continue; // ignore
					bytes += file.length();
					String text = toString(new FileInputStream(file), null);
					Document doc = createDocument(text);
					System.out.println("\n*********** FILE=" + file);
					
					for (int q=0; q < queries.length; q++) {
						try {
							Query query = parseQuery(queries[q]);
							
							for (int run=0; run < runs; run++) {
								// score the document in each enabled index flavour
								float score1 = 0.0f; float score2 = 0.0f;
								if (useMemIndex) score1 = query(createMemoryIndex(doc), query); 
								if (useRAMIndex) score2 = query(createRAMIndex(doc), query);
								if (useMemIndex && useRAMIndex) {
									System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
									// identical behaviour requires bit-identical scores within [0, 1]
									if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
										throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
									}
								}
							}
						} catch (Throwable t) {
							if (t instanceof OutOfMemoryError) t.printStackTrace();
							System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
							throw t;
						}
					}
				}
			}
			long end = System.currentTimeMillis();
			// per-iteration throughput statistics
			System.out.println("\nsecs = " + ((end-start)/1000.0f));
			System.out.println("queries/sec= " + 
				(1.0f * runs * queries.length * analyzers.length * files.length 
						/ ((end-start)/1000.0f)));
			float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
			System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
		}
		
		if (useMemIndex && useRAMIndex) 
			System.out.println("No bug found. done.");
		else 
			System.out.println("Done benchmarking (without checking correctness).");
	
public voidtestMany()

		String[] files = listFiles(new String[] {
			"*.txt", "*.html", "*.xml", "xdocs/*.xml", 
			"src/java/test/org/apache/lucene/queryParser/*.java",
			"src/java/org/apache/lucene/index/memory/*.java",
		});
		System.out.println("files = " + java.util.Arrays.asList(files));
		String[] xargs = new String[] {
			"1", "1", "memram", 
			"@src/test/org/apache/lucene/index/memory/testqueries.txt",
		};
		String[] args = new String[xargs.length + files.length];
		System.arraycopy(xargs, 0, args, 0, xargs.length);
		System.arraycopy(files, 0, args, xargs.length, files.length);
		run(args);
	
private static byte[]toByteArray(java.io.InputStream input)
// Reads the entire stream into a byte array; always closes the stream.
// Grows the output array geometrically and recycles the retired output
// array as the next read buffer, so each chunk is copied at most twice.

		try {
			// safe and fast even if input.available() behaves weird or buggy
			int len = Math.max(256, input.available());
			byte[] buffer = new byte[len];
			byte[] output = new byte[len];
			
			len = 0; // number of valid bytes accumulated in output so far
			int n;
			while ((n = input.read(buffer)) >= 0) {
				if (len + n > output.length) { // grow capacity
					byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
					System.arraycopy(output, 0, tmp, 0, len);
					System.arraycopy(buffer, 0, tmp, len, n);
					buffer = output; // use larger buffer for future larger bulk reads
					output = tmp;
				} else {
					System.arraycopy(buffer, 0, output, len, n);
				}
				len += n;
			}

			if (len == output.length) return output; // exact fit - no trim needed
			buffer = null; // help gc
			buffer = new byte[len]; // trim to the exact number of bytes read
			System.arraycopy(output, 0, buffer, 0, len);
			return buffer;
		} finally {
			if (input != null) input.close();
		}
	
private static java.lang.StringtoString(java.io.InputStream input, java.nio.charset.Charset charset)
// Decodes the entire stream into a String using the given charset, or
// DEFAULT_PLATFORM_CHARSET when charset is null. The stream is closed by
// toByteArray().

	
	
	// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
	         
		if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;			
		byte[] data = toByteArray(input);
		return charset.decode(ByteBuffer.wrap(data)).toString();