File | Doc | Category | Size | Date | Package
EdgeNGramTokenFilter.java | API Doc | Apache Lucene 2.2.0 | 4934 | Sat Jun 16 22:21:08 BST 2007 | org.apache.lucene.analysis.ngram

EdgeNGramTokenFilter

public class EdgeNGramTokenFilter extends TokenFilter
Tokenizes each input token into edge n-grams of the given size(s), i.e. n-grams anchored at the front or the back of the token.
author
Otis Gospodnetic

Fields Summary
public static final Side
DEFAULT_SIDE
public static final int
DEFAULT_MAX_GRAM_SIZE
public static final int
DEFAULT_MIN_GRAM_SIZE
private int
minGram
private int
maxGram
private Side
side
private LinkedList
ngrams
Constructors Summary
protected EdgeNGramTokenFilter(TokenStream input)
Creates EdgeNGramTokenFilter over the given input, configured with DEFAULT_SIDE, DEFAULT_MIN_GRAM_SIZE and DEFAULT_MAX_GRAM_SIZE

param
input TokenStream holding the input to be tokenized

    super(input);

    // minGram, maxGram and side are private, so a subclass that uses this
    // constructor cannot set them afterwards. Left at 0/0/null the filter
    // would emit a single empty token for every input token, so fall back
    // to the class defaults instead.
    this.minGram = DEFAULT_MIN_GRAM_SIZE;
    this.maxGram = DEFAULT_MAX_GRAM_SIZE;
    this.side = DEFAULT_SIDE;
    this.ngrams = new LinkedList();
  
public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
Creates EdgeNGramTokenFilter that emits an n-gram for every size from minGram up to and including maxGram

param
input TokenStream holding the input to be tokenized
param
side the {@link Side} of each token from which the n-grams are taken; must not be null
param
minGram the smallest n-gram to generate; must be at least 1
param
maxGram the largest n-gram to generate; must not be smaller than minGram

    super(input);

    // Reject bad arguments before any state is assigned. Each failure is
    // reported as an IllegalArgumentException.
    if (null == side) {
      throw new IllegalArgumentException("sideLabel must be either front or back");
    }
    if (minGram <= 0) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }

    // Queue of n-grams produced from the current input token but not yet returned.
    this.ngrams = new LinkedList();
    this.side = side;
    this.maxGram = maxGram;
    this.minGram = minGram;
  
public EdgeNGramTokenFilter(TokenStream input, String sideLabel, int minGram, int maxGram)
Creates EdgeNGramTokenFilter that emits an n-gram for every size in the given range, with the side selected by name

param
input TokenStream holding the input to be tokenized
param
sideLabel the name of the {@link Side} from which to chop off an n-gram
param
minGram the smallest n-gram to generate
param
maxGram the largest n-gram to generate

    // Resolve the label to a Side and let the Side-based constructor do all
    // validation; it throws IllegalArgumentException when handed a null Side.
    // NOTE(review): presumably Side.getSide returns null for an unknown label - confirm.
    this(
        input,
        Side.getSide(sideLabel),
        minGram,
        maxGram);
  
Methods Summary
public final org.apache.lucene.analysis.Tokennext()
Returns the next token in the stream, or null at EOS.

    if (ngrams.size() > 0) {
      return (Token) ngrams.removeFirst();
    }

    Token token = input.next();
    if (token == null) {
      return null;
    }

    ngram(token);
    if (ngrams.size() > 0)
      return (Token) ngrams.removeFirst();
    else
      return null;
  
private voidngram(org.apache.lucene.analysis.Token token)

    String inStr = token.termText();
    int inLen = inStr.length();
    int gramSize = minGram;
    while (gramSize <= maxGram) {
      // if the remaining input is too short, we can't generate any n-grams
      if (gramSize > inLen) {
        return;
      }

      // if we have hit the end of our n-gram size range, quit
      if (gramSize > maxGram) {
        return;
      }

      Token tok;
      if (side == Side.FRONT) {
        tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
      }
      else {
        tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
      }
      ngrams.add(tok);
      gramSize++;
    }