File: PatternAnalyzerTest.java
Category: API Doc (Apache Lucene 2.1.0)
Size: 11246 bytes
Date: Wed Feb 14 10:46:24 GMT 2007
Package: org.apache.lucene.index.memory

PatternAnalyzerTest

public class PatternAnalyzerTest extends TestCase
Verifies that Lucene's PatternAnalyzer and the normal Lucene analyzers have the same behaviour, returning the same results for any given free text. Runs a set of texts against several tokenizer/analyzer combinations. Can also be used as a simple benchmark.

Example usage:

cd lucene-cvs
java org.apache.lucene.index.memory.PatternAnalyzerTest 1 1 patluc 1 2 2 *.txt *.xml docs/*.html src/java/org/apache/lucene/index/*.java xdocs/*.xml ../nux/samples/data/*.xml
With WhitespaceAnalyzer, problems can be found; these are not bugs but questionable Lucene features: CharTokenizer.MAX_WORD_LEN = 255. Thus PatternAnalyzer produces correct output, whereas WhitespaceAnalyzer silently truncates tokens longer than 255 characters, so the comparisons in assertEquals() don't match up.
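For illustration, here is a minimal hypothetical sketch of that mismatch (not part of this class; it assumes the same Lucene 2.1 imports used throughout):

    // Hypothetical demo: a single "word" longer than CharTokenizer.MAX_WORD_LEN
    // is cut off by WhitespaceTokenizer but preserved intact by PatternAnalyzer.
    StringBuffer buf = new StringBuffer();
    for (int i = 0; i < 300; i++) buf.append('x');
    String longWord = buf.toString();

    TokenStream luc = new WhitespaceTokenizer(new StringReader(longWord));
    TokenStream pat = new PatternAnalyzer(
        PatternAnalyzer.WHITESPACE_PATTERN, false, null).tokenStream("", longWord);

    System.out.println(luc.next().termText().length()); // 255 (truncated)
    System.out.println(pat.next().termText().length()); // 300 (intact)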
Author: whoschek.AT.lbl.DOT.gov

Fields Summary
private static final Charset DEFAULT_PLATFORM_CHARSET
Constructors Summary
Methods Summary
private void assertEquals(java.util.List tokens1, java.util.List tokens2)

    int size = Math.min(tokens1.size(), tokens2.size());
    int i=0;
    try {
      for (; i < size; i++) {
        Token t1 = (Token) tokens1.get(i);
        Token t2 = (Token) tokens2.get(i);
        if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText");
        if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset");
        if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset");
        if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type");
      }
      if (tokens1.size() != tokens2.size())   throw new IllegalStateException("size1=" + tokens1.size() + ", size2=" + tokens2.size());
    }

    catch (IllegalStateException e) {
      if (size > 0) {
        System.out.println("i=" + i + ", size=" + size);
        System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'");
        System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'");
      }
      throw e;
    }
  
private java.util.List getTokens(org.apache.lucene.analysis.TokenStream stream)

    ArrayList tokens = new ArrayList();
    Token token;
    while ((token = stream.next()) != null) {
      tokens.add(token);
    }
    return tokens;
  
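A hedged usage sketch for the helper above (hypothetical, assuming this class's imports):

    // Hypothetical usage: drain a whitespace-split stream into a list.
    List tokens = getTokens(new WhitespaceTokenizer(new StringReader("a b c")));
    // tokens now holds three Token objects: 'a', 'b', 'c'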
private org.apache.lucene.analysis.TokenStream luceneTokenStream(java.lang.String text, boolean letters, boolean toLowerCase, java.util.Set stopWords)

    TokenStream stream;
    if (letters) 
      stream = new LetterTokenizer(new StringReader(text));
    else
      stream = new WhitespaceTokenizer(new StringReader(text));
    if (toLowerCase)  stream = new LowerCaseFilter(stream);
    if (stopWords != null) stream = new StopFilter(stream, stopWords);
    return stream;            
  
public static void main(java.lang.String[] args)
Runs the tests and/or benchmark

    new PatternAnalyzerTest().run(args);    
  
private org.apache.lucene.analysis.TokenStream patternTokenStream(java.lang.String text, boolean letters, boolean toLowerCase, java.util.Set stopWords)

    Pattern pattern;
    if (letters) 
      pattern = PatternAnalyzer.NON_WORD_PATTERN;
    else               
      pattern = PatternAnalyzer.WHITESPACE_PATTERN;
    PatternAnalyzer analyzer = new PatternAnalyzer(pattern, toLowerCase, stopWords);
    return analyzer.tokenStream("", text);
  
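As a hedged illustration (not part of the test itself), the two factory methods above can be cross-checked for a single string using the helpers defined in this class, exactly as run() does per file:

    // Hypothetical cross-check of the two pipelines for one input string;
    // assertEquals(List, List) above throws IllegalStateException on mismatch.
    String text = "The quick brown fox jumps over the lazy dog";
    List tokens1 = getTokens(patternTokenStream(text, true, true, null));
    List tokens2 = getTokens(luceneTokenStream(text, true, true, null));
    assertEquals(tokens1, tokens2);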
private void run(java.lang.String[] args)

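    // positional args (all optional, in order): iters runs cmd maxLetters
    // maxToLower maxStops files... (cf. the example usage in the class javadoc)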
    int k = -1;
    
    int iters = 1;
    if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
    
    int runs = 1;
    if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
    
    String cmd = "patluc";
    if (args.length > ++k) cmd = args[k];
    boolean usePattern = cmd.indexOf("pat") >= 0;
    boolean useLucene  = cmd.indexOf("luc") >= 0;
    
    int maxLetters = 1; // = 2: CharTokenizer.MAX_WORD_LEN issue; see class javadoc
    if (args.length > ++k) maxLetters = Integer.parseInt(args[k]);
    
    int maxToLower = 2;
    if (args.length > ++k) maxToLower = Integer.parseInt(args[k]);

    int maxStops = 2;
    if (args.length > ++k) maxStops = Integer.parseInt(args[k]);
    
    File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
    if (args.length > ++k) {
      files = new File[args.length - k];
      for (int i=k; i < args.length; i++) {
        files[i-k] = new File(args[i]);
      }
    }
    
    for (int iter=0; iter < iters; iter++) {
      System.out.println("\n########### iteration=" + iter);
      long start = System.currentTimeMillis();            
      long bytes = 0;
      
      for (int i=0; i < files.length; i++) {
        File file = files[i];
        if (!file.exists() || file.isDirectory()) continue; // ignore
        bytes += file.length();
        String text = toString(new FileInputStream(file), null);
        System.out.println("\n*********** FILE=" + file);

        for (int letters=0; letters < maxLetters; letters++) {
          boolean lettersOnly = letters == 0;
          
          for (int stops=0; stops < maxStops; stops++) {
            Set stopWords = null;
            if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
                
            for (int toLower=0; toLower < maxToLower; toLower++) {
              boolean toLowerCase = toLower != 0;
                
              for (int run=0; run < runs; run++) {
                List tokens1 = null; List tokens2 = null;
                try {
                  if (usePattern) tokens1 = getTokens(patternTokenStream(text, lettersOnly, toLowerCase, stopWords));
                  if (useLucene) tokens2 = getTokens(luceneTokenStream(text, lettersOnly, toLowerCase, stopWords));          
                  if (usePattern && useLucene) assertEquals(tokens1, tokens2);
                } catch (Throwable t) {
                  if (t instanceof OutOfMemoryError) t.printStackTrace();
                  System.out.println("fatal error at file=" + file + ", letters="+ lettersOnly + ", toLowerCase=" + toLowerCase + ", stopwords=" + (stopWords != null ? "english" : "none"));
                  System.out.println("\n\ntokens1=" + toString(tokens1));
                  System.out.println("\n\ntokens2=" + toString(tokens2));
                  throw t;
                }
              }
            }
          }
        }
        long end = System.currentTimeMillis();
        System.out.println("\nsecs = " + ((end-start)/1000.0f));
        System.out.println("files/sec= " + 
            (1.0f * runs * maxLetters * maxToLower * maxStops * files.length 
            / ((end-start)/1000.0f)));
        float mb = (1.0f * bytes * runs * maxLetters * maxToLower * maxStops) / (1024.0f * 1024.0f);
        System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
      }
    }
    
    if (usePattern && useLucene) 
      System.out.println("No bug found. done.");
    else 
      System.out.println("Done benchmarking (without checking correctness).");
  
public void testMany()

//    String[] files = MemoryIndexTest.listFiles(new String[] {
//      "*.txt", "*.html", "*.xml", "xdocs/*.xml", 
//      "src/test/org/apache/lucene/queryParser/*.java",
//      "src/org/apache/lucene/index/memory/*.java",
//    });
    String[] files = MemoryIndexTest.listFiles(new String[] {
      "../../*.txt", "../../*.html", "../../*.xml", "../../xdocs/*.xml", 
      "../../src/test/org/apache/lucene/queryParser/*.java",
      "src/java/org/apache/lucene/index/memory/*.java",
    });
    System.out.println("files = " + java.util.Arrays.asList(files));
    String[] xargs = new String[] {
      "1", "1", "patluc", "1", "2", "2",
    };
    String[] args = new String[xargs.length + files.length];
    System.arraycopy(xargs, 0, args, 0, xargs.length);
    System.arraycopy(files, 0, args, xargs.length, files.length);
    run(args);
  
private static byte[] toByteArray(java.io.InputStream input)

    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  
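A hedged sanity check for the method above (hypothetical; assumes java.io.ByteArrayInputStream is imported):

    // Hypothetical round-trip check: the copy matches the original bytes,
    // and toByteArray() closes the stream for us.
    byte[] original = "hello world".getBytes();
    byte[] copy = toByteArray(new ByteArrayInputStream(original));
    System.out.println(java.util.Arrays.equals(original, copy)); // true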
private java.lang.String toString(java.util.List tokens)

    if (tokens == null) return "null";
    String str = "[";
    for (int i=0; i < tokens.size(); i++) {
      Token t1 = (Token) tokens.get(i);
      str = str + "'" + t1.termText() + "', ";
    }
    return str + "]";
  
private static java.lang.String toString(java.io.InputStream input, java.nio.charset.Charset charset)

    // the following utility methods are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
    if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
    byte[] data = toByteArray(input);
    return charset.decode(ByteBuffer.wrap(data)).toString();
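
A hedged usage sketch mirroring the call in run() above:

    // Hypothetical usage: read a file as text; passing null for the charset
    // selects DEFAULT_PLATFORM_CHARSET, exactly as run() does.
    String text = toString(new FileInputStream(new File("CHANGES.txt")), null);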