FileDocCategorySizeDatePackage
PatternAnalyzer.javaAPI DocApache Lucene 2.1.017574Wed Feb 14 10:46:24 GMT 2007org.apache.lucene.index.memory

PatternAnalyzer

public class PatternAnalyzer extends Analyzer
Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern} (with behaviour identical to {@link String#split(String)}), and that combines the functionality of {@link org.apache.lucene.analysis.LetterTokenizer}, {@link org.apache.lucene.analysis.LowerCaseTokenizer}, {@link org.apache.lucene.analysis.WhitespaceTokenizer}, {@link org.apache.lucene.analysis.StopFilter} into a single efficient multi-purpose class.

If you are unsure what exactly a regular expression should look like, consider prototyping by simply trying various expressions on some test texts via {@link String#split(String)}. Once you are satisfied, give that regex to PatternAnalyzer. Also see the Java Regular Expression Tutorial.

This class can be considerably faster than the "normal" Lucene tokenizers. It can also serve as a building block in a compound Lucene {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this stemming example:

PatternAnalyzer pat = ...
TokenStream tokenStream = new SnowballFilter(
pat.tokenStream("content", "James is running round in the woods"),
"English"));
author
whoschek.AT.lbl.DOT.gov

Fields Summary
public static final Pattern
NON_WORD_PATTERN
"\\W+"; Divides text at non-letters (!Character.isLetter(c))
public static final Pattern
WHITESPACE_PATTERN
"\\s+"; Divides text at whitespaces (Character.isWhitespace(c))
private static final Set
EXTENDED_ENGLISH_STOP_WORDS
public static final PatternAnalyzer
DEFAULT_ANALYZER
A lower-casing word analyzer with English stop words (can be shared freely across threads without harm); global per class loader.
public static final PatternAnalyzer
EXTENDED_ANALYZER
A lower-casing word analyzer with extended English stop words (can be shared freely across threads without harm); global per class loader. The stop words are borrowed from http://thomas.loc.gov/home/stopwords.html, see http://thomas.loc.gov/home/all.about.inquery.html
private final Pattern
pattern
private final boolean
toLowerCase
private final Set
stopWords
Constructors Summary
/**
 * Constructs a new instance with the given parameters.
 *
 * @param pattern     a regular expression delimiting tokens
 * @param toLowerCase if true returns tokens after applying String.toLowerCase()
 * @param stopWords   if non-null, ignores all tokens that are contained in the
 *                    given stop set (after previously having applied
 *                    toLowerCase() if applicable); for example created via
 *                    {@link StopFilter#makeStopSet(String[])} and/or
 *                    {@link org.apache.lucene.analysis.WordlistLoader} as in
 *                    WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))
 *                    or other stop word lists
 */
public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
  if (pattern == null) {
    throw new IllegalArgumentException("pattern must not be null");
  }

  // Canonicalize well-known patterns onto the shared constants so that
  // tokenStream() can later detect them with a cheap reference comparison
  // (==) and take the FastStringTokenizer fast path.
  if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
  else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;

  // An empty stop set behaves exactly like having no stop words at all;
  // normalize to null so downstream checks stay simple.
  if (stopWords != null && stopWords.size() == 0) stopWords = null;

  this.pattern = pattern;
  this.toLowerCase = toLowerCase;
  this.stopWords = stopWords;
}
Methods Summary
private static booleaneq(java.lang.Object o1, java.lang.Object o2)
equality where o1 and/or o2 can be null

    return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
  
private static booleaneqPattern(java.util.regex.Pattern p1, java.util.regex.Pattern p2)
assumes p1 and p2 are not null

    return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
  
public booleanequals(java.lang.Object other)
Indicates whether some other object is "equal to" this one.

param
other the reference object with which to compare.
return
true if equal, false otherwise

    if (this  == other) return true;
    if (this  == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
    if (other == DEFAULT_ANALYZER && this  == EXTENDED_ANALYZER) return false;
    
    if (other instanceof PatternAnalyzer) {
      PatternAnalyzer p2 = (PatternAnalyzer) other;
      return 
        toLowerCase == p2.toLowerCase &&
        eqPattern(pattern, p2.pattern) &&
        eq(stopWords, p2.stopWords);
    }
    return false;
  
public inthashCode()
Returns a hash code value for the object.

return
the hash code.

    if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
    if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
    
    int h = 1;
    h = 31*h + pattern.pattern().hashCode();
    h = 31*h + pattern.flags();
    h = 31*h + (toLowerCase ? 1231 : 1237);
    h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
  
private static java.util.SetmakeStopSet(java.lang.String[] stopWords)
somewhat oversized to minimize hash collisions

    Set stops = new HashSet(stopWords.length * 2, 0.3f); 
    stops.addAll(Arrays.asList(stopWords));
    return stops;
//    return Collections.unmodifiableSet(stops);
  
/**
 * Reads until end-of-stream and returns all read chars, finally closes the stream.
 * Uses a pair of char arrays: one scratch buffer handed to read(), one
 * accumulator holding everything read so far.
 *
 * @param input the input stream
 * @throws IOException if an I/O error occurs while reading the stream
 */
    try {
      int len = 256;
      char[] buffer = new char[len];  // scratch buffer passed to read()
      char[] output = new char[len];  // accumulator of all chars read so far
      
      len = 0;  // reused: from here on, number of valid chars in 'output'
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          // at least double the capacity so growth is amortized O(1) per char
          char[] tmp = new char[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      return new String(output, 0, len);
    } finally {
      // close unconditionally, even if read() threw
      if (input != null) input.close();
    }
  
public org.apache.lucene.analysis.TokenStreamtokenStream(java.lang.String fieldName, java.lang.String text)
Creates a token stream that tokenizes the given string into token terms (aka words).

param
fieldName the name of the field to tokenize (currently ignored).
param
text the string to tokenize
return
a new token stream

    // Ideally the Analyzer superclass should have a method with the same signature, 
    // with a default impl that simply delegates to the StringReader flavour. 
    if (text == null) 
      throw new IllegalArgumentException("text must not be null");
    
    TokenStream stream;
    if (pattern == NON_WORD_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    else if (pattern == WHITESPACE_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }
    else {
      stream = new PatternTokenizer(text, pattern, toLowerCase);
      if (stopWords != null) stream = new StopFilter(stream, stopWords);
    }
    
    return stream;
  
public org.apache.lucene.analysis.TokenStreamtokenStream(java.lang.String fieldName, java.io.Reader reader)
Creates a token stream that tokenizes all the text in the given Reader; This implementation forwards to tokenStream(String, String) and is less efficient than tokenStream(String, String).

param
fieldName the name of the field to tokenize (currently ignored).
param
reader the reader delivering the text
return
a new token stream

    if (reader instanceof FastStringReader) { // fast path
      return tokenStream(fieldName, ((FastStringReader)reader).getString());
    }
    
    try {
      String text = toString(reader);
      return tokenStream(fieldName, text);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }