File | Doc | Category | Size | Date | Package
NGramTokenFilter.java | API Doc | Apache Lucene 2.2.0 | 3257 | Sat Jun 16 22:21:08 BST 2007 | org.apache.lucene.analysis.ngram

NGramTokenFilter

public class NGramTokenFilter extends TokenFilter
Tokenizes the input into n-grams of the given size(s).
author
Otis Gospodnetic

Fields Summary
public static final int
DEFAULT_MIN_NGRAM_SIZE
public static final int
DEFAULT_MAX_NGRAM_SIZE
private int
minGram
private int
maxGram
private LinkedList
ngrams
Constructors Summary
public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
Creates NGramTokenFilter with given min and max n-grams.

param
input TokenStream holding the input to be tokenized
param
minGram the smallest n-gram to generate
param
maxGram the largest n-gram to generate


                                    
         
    super(input);
    // Reject invalid bounds before any state is stored: the smallest gram
    // must have at least one character, and the range must not be inverted.
    if (minGram <= 0) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    // Queue of n-gram tokens generated but not yet handed out by next().
    this.ngrams = new LinkedList();
    this.maxGram = maxGram;
    this.minGram = minGram;
  
public NGramTokenFilter(TokenStream input)
Creates NGramTokenFilter with default min and max n-grams.

param
input TokenStream holding the input to be tokenized

    // Delegate to the three-argument constructor, supplying the class-level
    // default minimum and maximum n-gram sizes.
    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
  
Methods Summary
public final org.apache.lucene.analysis.Token next()
Returns the next token in the stream, or null at EOS.

    // Hand out n-grams still queued from the previously consumed input token.
    if (!ngrams.isEmpty()) {
      return (Token) ngrams.removeFirst();
    }

    // Keep pulling input tokens until one yields at least one n-gram. A token
    // whose text is shorter than minGram produces none; returning null for it
    // would falsely signal end-of-stream and drop every token that follows.
    Token token;
    while ((token = input.next()) != null) {
      ngram(token);
      if (!ngrams.isEmpty()) {
        return (Token) ngrams.removeFirst();
      }
    }

    // The wrapped stream is exhausted: report end-of-stream.
    return null;
  
private void ngram(org.apache.lucene.analysis.Token token)

 
    // Appends every n-gram of the token's text to the pending queue: all
    // grams of size minGram from left to right, then minGram+1, and so on.
    String inStr = token.termText();
    int inLen = inStr.length();
    // Offsets must refer to the source text, not to the token's own text,
    // so each gram's in-token position is shifted by the token's start.
    int base = token.startOffset();
    // No gram can be longer than the text itself; capping here also avoids
    // int overflow of the size counter when maxGram is Integer.MAX_VALUE.
    int largest = Math.min(maxGram, inLen);
    for (int gramSize = minGram; gramSize <= largest; gramSize++) {
      int lastStart = inLen - gramSize;   // last position where a gram still fits
      for (int pos = 0; pos <= lastStart; pos++) {
        int end = pos + gramSize;
        ngrams.add(new Token(inStr.substring(pos, end), base + pos, base + end));
      }
    }