// File | Doc | Category | Size | Date | Package
// PatternAnalyzer.java | API Doc | Apache Lucene 2.1.0 | 17574 | Wed Feb 14 10:46:24 GMT 2007 | org.apache.lucene.index.memory
//
// PatternAnalyzer.java

package org.apache.lucene.index.memory;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
 * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
 * (with behaviour identical to {@link String#split(String)}),
 * and that combines the functionality of
 * {@link org.apache.lucene.analysis.LetterTokenizer},
 * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
 * {@link org.apache.lucene.analysis.WhitespaceTokenizer},
 * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
 * multi-purpose class.
 * <p>
 * If you are unsure what exactly a regular expression should look like, consider 
 * prototyping by simply trying various expressions on some test texts via
 * {@link String#split(String)}. Once you are satisfied, give that regex to 
 * PatternAnalyzer. Also see <a target="_blank" 
 * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
 * <p>
 * This class can be considerably faster than the "normal" Lucene tokenizers. 
 * It can also serve as a building block in a compound Lucene
 * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this 
 * stemming example:
 * <pre>
 * PatternAnalyzer pat = ...
 * TokenStream tokenStream = new SnowballFilter(
 *     pat.tokenStream("content", "James is running round in the woods"), 
 *     "English");
 * </pre>
 * 
 * @author whoschek.AT.lbl.DOT.gov
 */
public class PatternAnalyzer extends Analyzer {
  
  /**
   * <code>"\\W+"</code>; Divides text at non-letters (!Character.isLetter(c)).
   * <p>
   * An analyzer constructed with this pattern (or with an equal pattern and
   * equal flags) does not run the regular expression at all: it takes a fast
   * path that treats a char as part of a token if and only if
   * <code>Character.isLetter(c)</code> is true, so digits and underscores
   * act as separators as well.
   */
  public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
  
  /**
   * <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)).
   * <p>
   * An analyzer constructed with this pattern (or with an equal pattern and
   * equal flags) takes a fast path that treats a char as a separator if and
   * only if <code>Character.isWhitespace(c)</code> is true, instead of
   * running the regular expression.
   */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
  
  /**
   * Extended English stop word list backing {@link #EXTENDED_ANALYZER}.
   * <p>
   * Every entry must be a single lower-case word without surrounding
   * whitespace: tokens are compared against this set after being split at
   * non-letters and lower-cased, so an entry containing a blank can never
   * match. ("once" and "one" are therefore two separate entries, and
   * "xother" carries no trailing space.)
   */
  private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
    "a", "about", "above", "across", "adj", "after", "afterwards",
    "again", "against", "albeit", "all", "almost", "alone", "along",
    "already", "also", "although", "always", "among", "amongst", "an",
    "and", "another", "any", "anyhow", "anyone", "anything",
    "anywhere", "are", "around", "as", "at", "be", "became", "because",
    "become", "becomes", "becoming", "been", "before", "beforehand",
    "behind", "being", "below", "beside", "besides", "between",
    "beyond", "both", "but", "by", "can", "cannot", "co", "could",
    "down", "during", "each", "eg", "either", "else", "elsewhere",
    "enough", "etc", "even", "ever", "every", "everyone", "everything",
    "everywhere", "except", "few", "first", "for", "former",
    "formerly", "from", "further", "had", "has", "have", "he", "hence",
    "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
    "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
    "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
    "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
    "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
    "must", "my", "myself", "namely", "neither", "never",
    "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
    "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise",
    "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
    "rather", "s", "same", "seem", "seemed", "seeming", "seems",
    "several", "she", "should", "since", "so", "some", "somehow",
    "someone", "something", "sometime", "sometimes", "somewhere",
    "still", "such", "t", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefor", "therein", "thereupon", "these", "they", "this",
    "those", "though", "through", "throughout", "thru", "thus", "to",
    "together", "too", "toward", "towards", "under", "until", "up",
    "upon", "us", "very", "via", "was", "we", "well", "were", "what",
    "whatever", "whatsoever", "when", "whence", "whenever",
    "whensoever", "where", "whereafter", "whereas", "whereat",
    "whereby", "wherefrom", "wherein", "whereinto", "whereof",
    "whereon", "whereto", "whereunto", "whereupon", "wherever",
    "wherewith", "whether", "which", "whichever", "whichsoever",
    "while", "whilst", "whither", "who", "whoever", "whole", "whom",
    "whomever", "whomsoever", "whose", "whosoever", "why", "will",
    "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
    "xother", "xnote", "yet", "you", "your", "yours", "yourself",
    "yourselves"});
    
  /**
   * A lower-casing word analyzer with English stop words (can be shared
   * freely across threads without harm); global per class loader.
   * It is built on {@link #NON_WORD_PATTERN} with the stop words taken from
   * <code>StopAnalyzer.ENGLISH_STOP_WORDS</code>.
   */
  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
    NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
    
  /**
   * A lower-casing word analyzer with <b>extended </b> English stop words
   * (can be shared freely across threads without harm); global per class
   * loader. It is built on {@link #NON_WORD_PATTERN}. The stop words are
   * borrowed from http://thomas.loc.gov/home/stopwords.html, see
   * http://thomas.loc.gov/home/all.about.inquery.html
   */
  public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
    NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
    
  /**
   * Token delimiter; the constructor replaces it with the shared
   * NON_WORD_PATTERN / WHITESPACE_PATTERN instance when it is equal to one
   * of them, so that identity comparison can select the fast path.
   */
  private final Pattern pattern;
  /** whether tokens are lower-cased before being returned */
  private final boolean toLowerCase;
  /** tokens to drop, or null if no stop word filtering is to be done */
  private final Set stopWords;
  
  /**
   * Creates an analyzer that splits text at matches of the given pattern.
   * <p>
   * A pattern that is equal (same regex string and same flags) to
   * {@link #NON_WORD_PATTERN} or {@link #WHITESPACE_PATTERN} is replaced by
   * that shared constant so that tokenization can later pick the fast path
   * with a plain identity check. An empty stop set is treated like
   * <code>null</code>.
   * 
   * @param pattern
   *            a regular expression delimiting tokens; must not be null
   * @param toLowerCase
   *            if <code>true</code> returns tokens after applying
   *            String.toLowerCase()
   * @param stopWords
   *            if non-null, ignores all tokens that are contained in the
   *            given stop set (after previously having applied toLowerCase()
   *            if applicable). For example, created via
   *            {@link StopFilter#makeStopSet(String[])} and/or
   *            {@link org.apache.lucene.analysis.WordlistLoader} as in
   *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
   *            or <a href="http://www.unine.ch/info/clef/">other stop words
   *            lists </a>.
   * @throws IllegalArgumentException
   *            if <code>pattern</code> is null
   */
  public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
    if (pattern == null) {
      throw new IllegalArgumentException("pattern must not be null");
    }
    
    // canonicalize well-known patterns to the shared instances
    Pattern canonical = pattern;
    if (eqPattern(NON_WORD_PATTERN, pattern)) {
      canonical = NON_WORD_PATTERN;
    } else if (eqPattern(WHITESPACE_PATTERN, pattern)) {
      canonical = WHITESPACE_PATTERN;
    }
    
    this.pattern = canonical;
    this.toLowerCase = toLowerCase;
    // an empty stop set means "no filtering"
    this.stopWords = (stopWords == null || stopWords.isEmpty()) ? null : stopWords;
  }
  
  /**
   * Tokenizes the given string into terms (aka words) and returns them as a
   * token stream. The two shared patterns are served by the hand-written
   * fast tokenizer (which also does the stop word filtering); any other
   * pattern goes through the regex tokenizer, wrapped in a StopFilter when
   * stop words are configured.
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param text
   *            the string to tokenize; must not be null
   * @return a new token stream
   * @throws IllegalArgumentException
   *            if <code>text</code> is null
   */
  public TokenStream tokenStream(String fieldName, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature, 
    // with a default impl that simply delegates to the StringReader flavour. 
    if (text == null) {
      throw new IllegalArgumentException("text must not be null");
    }
    
    if (pattern == NON_WORD_PATTERN) { // fast path: split at non-letters
      return new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    if (pattern == WHITESPACE_PATTERN) { // fast path: split at whitespace
      return new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }
    
    // general case: regex driven tokenizer, optionally followed by a stop filter
    TokenStream regexStream = new PatternTokenizer(text, pattern, toLowerCase);
    return stopWords == null ? regexStream : new StopFilter(regexStream, stopWords);
  }
  
  /**
   * Tokenizes all the text delivered by the given Reader. The reader is
   * drained into a String (and closed) and the work is then handed to
   * <code>tokenStream(String, String)</code>, which makes this flavour the
   * less efficient one. A FastStringReader is special-cased: its string is
   * taken directly, without reading from it.
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            the reader delivering the text
   * @return a new token stream
   * @throws RuntimeException
   *            wrapping the IOException if reading from the reader fails
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    String text;
    if (reader instanceof FastStringReader) { // fast path: no copying needed
      text = ((FastStringReader) reader).getString();
    } else {
      try {
        text = toString(reader);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    return tokenStream(fieldName, text);
  }
  
  /**
   * Compares this analyzer with another object by value: two analyzers are
   * equal if they agree on the lower-casing flag, on the pattern (regex
   * string and flags) and on the stop word set.
   * 
   * @param other
   *            the reference object with which to compare.
   * @return true if equal, false otherwise
   */
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    // shortcut: the two shared analyzers are never equal to each other
    if ((this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER)
        || (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER)) {
      return false;
    }
    if (!(other instanceof PatternAnalyzer)) {
      return false;
    }
    
    PatternAnalyzer that = (PatternAnalyzer) other;
    if (toLowerCase != that.toLowerCase) {
      return false;
    }
    if (!eqPattern(pattern, that.pattern)) {
      return false;
    }
    return eq(stopWords, that.stopWords);
  }
  
  /**
   * Returns a hash code value for the object, derived from exactly the
   * state that {@link #equals(Object)} compares: the pattern's regex string
   * and flags, the lower-casing flag and the stop word set.
   * <p>
   * The value is always computed from the current state. Hard-wired hash
   * constants for the shared analyzers are deliberately avoided: because
   * equality is value based, an independently constructed but equal
   * analyzer must yield the same hash code, and a constant would silently
   * break that guarantee whenever a stop word list changes.
   * 
   * @return the hash code.
   */
  public int hashCode() {
    int h = 1;
    h = 31*h + pattern.pattern().hashCode();
    h = 31*h + pattern.flags();
    h = 31*h + (toLowerCase ? 1231 : 1237); // same values as Boolean.hashCode()
    h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
  }
  
  /** null-safe equality: true if both are the same reference (incl. both null) or o1.equals(o2) */
  private static boolean eq(Object o1, Object o2) {
    if (o1 == o2) {
      return true;
    }
    return o1 != null && o1.equals(o2);
  }
  
  /** patterns are equal if they have the same flags and regex string; assumes p1 and p2 are not null */
  private static boolean eqPattern(Pattern p1, Pattern p2) {
    if (p1 == p2) {
      return true;
    }
    if (p1.flags() != p2.flags()) {
      return false;
    }
    return p1.pattern().equals(p2.pattern());
  }
    
  /**
   * Drains the given reader and returns everything it delivered as a
   * String; the reader is closed afterwards, whether or not reading
   * succeeded.
   * 
   * @param input the input stream
   * @return all chars read until end-of-stream
   * @throws IOException if an I/O error occurs while reading the stream
   */
  private static String toString(Reader input) throws IOException {
    try {
      char[] chunk = new char[256];
      char[] collected = new char[256];
      int size = 0;
      
      for (int count = input.read(chunk); count >= 0; count = input.read(chunk)) {
        int needed = size + count;
        if (needed <= collected.length) {
          System.arraycopy(chunk, 0, collected, size, count);
        } else { // grow capacity (at least doubling)
          char[] grown = new char[Math.max(collected.length << 1, needed)];
          System.arraycopy(collected, 0, grown, 0, size);
          System.arraycopy(chunk, 0, grown, size, count);
          chunk = collected; // recycle the old, larger array for future bulk reads
          collected = grown;
        }
        size = needed;
      }

      return new String(collected, 0, size);
    } finally {
      if (input != null) input.close();
    }
  }
    
  /**
   * Copies the given words into a new HashSet that is somewhat oversized
   * (twice the word count, load factor 0.3) to minimize hash collisions.
   * The returned set is a plain mutable HashSet; it is not wrapped into an
   * unmodifiable view.
   */
  private static Set makeStopSet(String[] stopWords) {
    Set stops = new HashSet(stopWords.length * 2, 0.3f);
    for (int i = 0; i < stopWords.length; i++) {
      stops.add(stopWords[i]);
    }
    return stops;
  }

  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * The work horse: a token stream that cuts a string at every match of a
   * regular expression and returns the non-empty pieces between the
   * matches. Performance isn't fantastic, but it's not nearly as bad as one
   * might think - kudos to the Sun regex developers.
   */
  private static final class PatternTokenizer extends TokenStream {
    
    /** locale used for lower-casing; the default locale at class initialization time */
    private static final Locale locale = Locale.getDefault();
    
    private final String str;
    private final boolean toLowerCase;
    /** delimiter matcher over str; set to null once the input is exhausted */
    private Matcher matcher;
    /** index just behind the most recent delimiter match, i.e. where the next token starts */
    private int pos = 0;
    
    public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
      this.str = str;
      this.toLowerCase = toLowerCase;
      this.matcher = pattern.matcher(str);
    }

    /**
     * Returns the next non-empty piece of text between two delimiter
     * matches (or between a match and the start/end of the input), or null
     * when the input is exhausted. The token is created with the piece's
     * start and end indices within the input string.
     */
    public Token next() {
      // keep scanning until a non-empty piece shows up or the input runs out;
      // empty pieces arise from leading, trailing and adjacent delimiters
      while (matcher != null) {
        final int tokenStart = pos;
        final int tokenEnd;
        if (matcher.find()) {
          tokenEnd = matcher.start();
          pos = matcher.end();
        } else {
          tokenEnd = str.length(); // trailing piece behind the last delimiter
          matcher = null; // we're finished
        }
        
        if (tokenStart != tokenEnd) {
          String term = str.substring(tokenStart, tokenEnd);
          if (toLowerCase) {
            term = term.toLowerCase(locale);
          }
          return new Token(term, tokenStart, tokenEnd);
        }
      }
      return null;
    }
    
  } 
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * Special-case class for best performance in common cases; this class is
   * otherwise unnecessary. It scans the string char by char instead of
   * running a regex, splitting either at non-letters or at whitespace, and
   * applies lower-casing and stop word removal on the fly.
   */
  private static final class FastStringTokenizer extends TokenStream {
    
    /** locale used for lower-casing; the default locale at class initialization time */
    private static final Locale locale = Locale.getDefault();
    
    private final String str;
    /** true: token chars are letters; false: token chars are non-whitespace chars */
    private final boolean isLetter;
    private final boolean toLowerCase;
    /** tokens to skip, or null for no filtering */
    private final Set stopWords;
    /** index at which the next scan resumes */
    private int pos;
    
    public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
      this.str = str;
      this.isLetter = isLetter;
      this.toLowerCase = toLowerCase;
      this.stopWords = stopWords;
    }

    /**
     * Returns the next token that is not a stop word, or null when the end
     * of the string has been reached. The token is created with the start
     * and end indices of the term within the input string.
     */
    public Token next() {
      // copy instance vars into locals for the scan loops (performance)
      final String input = str;
      final int length = input.length();
      final boolean lettersOnly = isLetter;
      int cursor = pos;
      
      int tokenStart = 0;
      String term;
      while (true) {
        // skip separator chars in front of the token
        while (cursor < length && !isTokenChar(input.charAt(cursor), lettersOnly)) {
          cursor++;
        }
        if (cursor >= length) { // nothing but separators were left
          term = null;
          break;
        }
        
        // consume the token chars
        tokenStart = cursor;
        while (cursor < length && isTokenChar(input.charAt(cursor), lettersOnly)) {
          cursor++;
        }
        
        term = input.substring(tokenStart, cursor);
        if (toLowerCase) {
          term = term.toLowerCase(locale);
        }
        if (!isStopWord(term)) {
          break; // found a token worth returning
        }
      }
      
      pos = cursor;
      return term == null ? null : new Token(term, tokenStart, cursor);
    }
    
    private boolean isTokenChar(char c, boolean isLetter) {
      if (isLetter) {
        return Character.isLetter(c);
      }
      return !Character.isWhitespace(c);
    }
    
    private boolean isStopWord(String text) {
      if (stopWords == null) {
        return false;
      }
      return stopWords.contains(text);
    }
    
  }

  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * A StringReader that exposes its contained string for fast direct access,
   * so that a consumer can obtain the text without reading it char by char.
   * Might make sense to generalize this to CharSequence and make it public?
   */
  static final class FastStringReader extends StringReader {

    /** the very String instance this reader was created with */
    private final String content;
    
    FastStringReader(String s) {
      super(s);
      content = s;
    }
    
    /** Returns the string passed to the constructor (same instance, no copy). */
    String getString() {
      return content;
    }
  }
  
}