File | Doc | Category | Size | Date | Package
EdgeNGramTokenizer.java | API Doc | Apache Lucene 2.2.0 | 4617 | Sat Jun 16 22:21:08 BST 2007 | org.apache.lucene.analysis.ngram

EdgeNGramTokenizer

public class EdgeNGramTokenizer extends Tokenizer
Tokenizes the input from an edge into n-grams of given size(s).
author
Otis Gospodnetic
author
Adam Hiatt

Fields Summary
public static final Side
DEFAULT_SIDE
public static final int
DEFAULT_MAX_GRAM_SIZE
public static final int
DEFAULT_MIN_GRAM_SIZE
private int
minGram
private int
maxGram
private int
gramSize
private Side
side
private boolean
started
private int
inLen
private String
inStr
Constructors Summary
public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram)
Builds an EdgeNGramTokenizer that emits n-grams of every size in the given range

param
input Reader holding the input to be tokenized
param
side the {@link Side} from which to chop off an n-gram
param
minGram the smallest n-gram to generate
param
maxGram the largest n-gram to generate
throws
IllegalArgumentException if side is null, minGram is less than 1, or minGram exceeds maxGram

    super(input);

    // Checks run in a fixed order: the first violated constraint decides the message.
    if (null == side) {
      throw new IllegalArgumentException("sideLabel must be either front or back");
    }
    if (minGram <= 0) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }

    // All arguments are valid; remember the configuration for next().
    this.side = side;
    this.minGram = minGram;
    this.maxGram = maxGram;
  
public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram)
Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range

param
input Reader holding the input to be tokenized
param
sideLabel the name of the {@link Side} from which to chop off an n-gram
param
minGram the smallest n-gram to generate
param
maxGram the largest n-gram to generate

    // Resolve the label to a Side and delegate to the Side-based constructor,
    // which performs all argument validation.
    // NOTE(review): presumably Side.getSide returns null for an unrecognised
    // label, which the delegated constructor rejects -- confirm against Side.getSide.
    this(input, Side.getSide(sideLabel), minGram, maxGram);
  
Methods Summary
public final org.apache.lucene.analysis.Tokennext()
Returns the next token in the stream, or null at EOS.

    // if we are just starting, read the whole input
    if (!started) {
      started = true;
      char[] chars = new char[1024];
      input.read(chars);
      inStr = new String(chars).trim();  // remove any trailing empty strings
      inLen = inStr.length();
      gramSize = minGram;
    }

    // if the remaining input is too short, we can't generate any n-grams
    if (gramSize > inLen) {
      return null;
    }

    // if we have hit the end of our n-gram size range, quit
    if (gramSize > maxGram) {
      return null;
    }

    Token tok;
    if (side == Side.FRONT) {
      tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
    }
    else {
      tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
    }

    gramSize++;
    return tok;