File: PatternAnalyzerTest.java
Category: API Doc (Apache Lucene 1.9)
Size: 10112 bytes
Date: Mon Feb 20 09:18:34 GMT 2006
Package: org.apache.lucene.index.memory

PatternAnalyzerTest

public class PatternAnalyzerTest extends TestCase
Verifies that Lucene's PatternAnalyzer and the normal Lucene analyzers have the same behaviour, returning the same results for any given free text. Runs a set of texts against both tokenizer/analyzer implementations. Can also be used as a simple benchmark.

Example usage:

cd lucene-cvs
java org.apache.lucene.index.memory.PatternAnalyzerTest 1 1 patluc 1 2 2 *.txt *.xml docs/*.html src/java/org/apache/lucene/index/*.java xdocs/*.xml ../nux/samples/data/*.xml
With the WhitespaceAnalyzer, mismatches can be found. These are not bugs but questionable Lucene features: CharTokenizer.MAX_WORD_LEN = 255, so the PatternAnalyzer produces correct output while the WhitespaceAnalyzer silently truncates any longer token, and the comparison in assertEquals() consequently fails.
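For illustration, a minimal sketch of that mismatch on a single oversized token. This is hypothetical demo code, assuming the Lucene 1.9 API used throughout this file (org.apache.lucene.analysis plus the contrib PatternAnalyzer) and an enclosing method that may throw IOException:

		// a 300-char token exceeds CharTokenizer.MAX_WORD_LEN (255)
		StringBuffer buf = new StringBuffer();
		for (int i = 0; i < 300; i++) buf.append('x');
		String text = buf.toString();

		TokenStream luc = new WhitespaceTokenizer(new StringReader(text));
		System.out.println(luc.next().termText().length()); // prints 255 (silently truncated)

		PatternAnalyzer pat = new PatternAnalyzer(PatternAnalyzer.WHITESPACE_PATTERN, false, null);
		System.out.println(pat.tokenStream("", text).next().termText().length()); // prints 300 (intact)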
Author: whoschek.AT.lbl.DOT.gov

Fields Summary
private static final Charset DEFAULT_PLATFORM_CHARSET
Constructors Summary
Methods Summary
private void assertEquals(java.util.List tokens1, java.util.List tokens2)

		int size = Math.min(tokens1.size(), tokens2.size());
		int i=0;
		try {
			for (; i < size; i++) {
				Token t1 = (Token) tokens1.get(i);
				Token t2 = (Token) tokens2.get(i);
				if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText");
				if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset");
				if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset");
				if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type");
			}
			if (tokens1.size() != tokens2.size()) 	throw new IllegalStateException("size1=" + tokens1.size() + ", size2=" + tokens2.size());
		}

		catch (IllegalStateException e) {
			if (size > 0) {
				System.out.println("i=" + i + ", size=" + size);
				System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'");
				System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'");
			}
			throw e;
		}
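As a hypothetical illustration of what this comparison catches: two token lists that agree on term text but differ in offsets trip the "startOffset" check (Token(String, int, int) is the Lucene 1.9 constructor; assumes java.util.Arrays is imported):

		List a = Arrays.asList(new Object[] { new Token("foo", 0, 3) });
		List b = Arrays.asList(new Object[] { new Token("foo", 1, 4) });
		// assertEquals(a, b) prints the failing index and then
		// throws IllegalStateException("startOffset")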
	
private java.util.List getTokens(org.apache.lucene.analysis.TokenStream stream)

		ArrayList tokens = new ArrayList();
		Token token;
		while ((token = stream.next()) != null) {
			tokens.add(token);
		}
		return tokens;
	
private org.apache.lucene.analysis.TokenStream luceneTokenStream(java.lang.String text, boolean letters, boolean toLowerCase, java.util.Set stopWords)

		TokenStream stream;
		if (letters) 
			stream = new LetterTokenizer(new StringReader(text));
		else
			stream = new WhitespaceTokenizer(new StringReader(text));
		if (toLowerCase)	stream = new LowerCaseFilter(stream);
		if (stopWords != null) stream = new StopFilter(stream, stopWords);
		return stream;						
	
public static void main(java.lang.String[] args)
Runs the tests and/or benchmark

		new PatternAnalyzerTest().run(args);		
	
private org.apache.lucene.analysis.TokenStream patternTokenStream(java.lang.String text, boolean letters, boolean toLowerCase, java.util.Set stopWords)

		Pattern pattern;
		if (letters) 
			pattern = PatternAnalyzer.NON_WORD_PATTERN;
		else 							
			pattern = PatternAnalyzer.WHITESPACE_PATTERN;
		PatternAnalyzer analyzer = new PatternAnalyzer(pattern, toLowerCase, stopWords);
		return analyzer.tokenStream("", text);
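Beyond the two built-in patterns used here, PatternAnalyzer accepts any java.util.regex.Pattern as the token separator. A hypothetical sketch splitting on whitespace and punctuation in one pass, with the same constructor arguments as above:

		Pattern custom = Pattern.compile("[\\s,;.]+"); // hypothetical separator pattern
		PatternAnalyzer analyzer = new PatternAnalyzer(custom, true,
				StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
		TokenStream stream = analyzer.tokenStream("", "Some, sample; text.");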
	
private void run(java.lang.String[] args)

		int k = -1;
		
		int iters = 1;
		if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
		
		int runs = 1;
		if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
		
		String cmd = "patluc";
		if (args.length > ++k) cmd = args[k];
		boolean usePattern = cmd.indexOf("pat") >= 0;
		boolean useLucene  = cmd.indexOf("luc") >= 0;
		
		int maxLetters = 1; // set to 2 to also exercise WhitespaceTokenizer, which hits the CharTokenizer.MAX_WORD_LEN issue; see class javadoc
		if (args.length > ++k) maxLetters = Integer.parseInt(args[k]);
		
		int maxToLower = 2;
		if (args.length > ++k) maxToLower = Integer.parseInt(args[k]);

		int maxStops = 2;
		if (args.length > ++k) maxStops = Integer.parseInt(args[k]);
		
		File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
		if (args.length > ++k) {
			files = new File[args.length - k];
			for (int i=k; i < args.length; i++) {
				files[i-k] = new File(args[i]);
			}
		}
		
		for (int iter=0; iter < iters; iter++) {
			System.out.println("\n########### iteration=" + iter);
			long start = System.currentTimeMillis();						
			long bytes = 0;
			
			for (int i=0; i < files.length; i++) {
				File file = files[i];
				if (!file.exists() || file.isDirectory()) continue; // ignore
				bytes += file.length();
				String text = toString(new FileInputStream(file), null);
				System.out.println("\n*********** FILE=" + file);

				for (int letters=0; letters < maxLetters; letters++) {
					boolean lettersOnly = letters == 0;
					
					for (int stops=0; stops < maxStops; stops++) {
						Set stopWords = null;
						if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
								
						for (int toLower=0; toLower < maxToLower; toLower++) {
							boolean toLowerCase = toLower != 0;
								
							for (int run=0; run < runs; run++) {
								List tokens1 = null; List tokens2 = null;
								try {
									if (usePattern) tokens1 = getTokens(patternTokenStream(text, lettersOnly, toLowerCase, stopWords));
									if (useLucene) tokens2 = getTokens(luceneTokenStream(text, lettersOnly, toLowerCase, stopWords));					
									if (usePattern && useLucene) assertEquals(tokens1, tokens2);
								} catch (Throwable t) {
									if (t instanceof OutOfMemoryError) t.printStackTrace();
									System.out.println("fatal error at file=" + file + ", letters="+ lettersOnly + ", toLowerCase=" + toLowerCase + ", stopwords=" + (stopWords != null ? "english" : "none"));
									System.out.println("\n\ntokens1=" + toString(tokens1));
									System.out.println("\n\ntokens2=" + toString(tokens2));
									throw t;
								}
							}
						}
					}
				}
				long end = System.currentTimeMillis();
				System.out.println("\nsecs = " + ((end-start)/1000.0f));
				System.out.println("files/sec= " + 
						(1.0f * runs * maxLetters * maxToLower * maxStops * files.length 
						/ ((end-start)/1000.0f)));
				float mb = (1.0f * bytes * runs * maxLetters * maxToLower * maxStops) / (1024.0f * 1024.0f);
				System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
			}
		}
		
		if (usePattern && useLucene) 
			System.out.println("No bug found. done.");
		else 
			System.out.println("Done benchmarking (without checking correctness).");
	
public void testMany()

		String[] files = MemoryIndexTest.listFiles(new String[] {
			"*.txt", "*.html", "*.xml", "xdocs/*.xml", 
			"src/test/org/apache/lucene/queryParser/*.java",
			"src/org/apache/lucene/index/memory/*.java",
		});
		System.out.println("files = " + java.util.Arrays.asList(files));
		String[] xargs = new String[] {
			"1", "1", "patluc", "1", "2", "2",
		};
		String[] args = new String[xargs.length + files.length];
		System.arraycopy(xargs, 0, args, 0, xargs.length);
		System.arraycopy(files, 0, args, xargs.length, files.length);
		run(args);
	
private static byte[] toByteArray(java.io.InputStream input)

		try {
			// safe and fast even if input.available() behaves weirdly or is buggy
			int len = Math.max(256, input.available());
			byte[] buffer = new byte[len];
			byte[] output = new byte[len];
			
			len = 0;
			int n;
			while ((n = input.read(buffer)) >= 0) {
				if (len + n > output.length) { // grow capacity
					byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
					System.arraycopy(output, 0, tmp, 0, len);
					System.arraycopy(buffer, 0, tmp, len, n);
					buffer = output; // use larger buffer for future larger bulk reads
					output = tmp;
				} else {
					System.arraycopy(buffer, 0, output, len, n);
				}
				len += n;
			}

			if (len == output.length) return output;
			buffer = null; // help gc
			buffer = new byte[len];
			System.arraycopy(output, 0, buffer, 0, len);
			return buffer;
		} finally {
			if (input != null) input.close();
		}
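For comparison, a simpler sketch of the same exhaustive read built on java.io.ByteArrayOutputStream, trading the manual buffer doubling above for the stream's internal growth plus one final copy (hypothetical helper, standard library only):

		private static byte[] toByteArraySimple(java.io.InputStream input) throws java.io.IOException {
			try {
				java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream(256);
				byte[] buf = new byte[4096];
				int n;
				while ((n = input.read(buf)) >= 0) out.write(buf, 0, n); // grows as needed
				return out.toByteArray();
			} finally {
				input.close();
			}
		}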
	
private java.lang.String toString(java.util.List tokens)

		if (tokens == null) return "null";
		String str = "[";
		for (int i=0; i < tokens.size(); i++) {
			Token t1 = (Token) tokens.get(i);
			str = str + "'" + t1.termText() + "', ";
		}
		return str + "]";
	
private static java.lang.String toString(java.io.InputStream input, java.nio.charset.Charset charset)

		// this utility method and toByteArray() above are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
		if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
		byte[] data = toByteArray(input);
		return charset.decode(ByteBuffer.wrap(data)).toString();