File | Doc | Category | Size | Date | Package
NGramTokenizer.java | API Doc | Apache Lucene 2.2.0 | 3103 | Sat Jun 16 22:21:08 BST 2007 | org.apache.lucene.analysis.ngram

NGramTokenizer

public class NGramTokenizer extends Tokenizer
Tokenizes the input into n-grams of the given size(s).
author
Otis Gospodnetic

Fields Summary
public static final int
DEFAULT_MIN_NGRAM_SIZE
public static final int
DEFAULT_MAX_NGRAM_SIZE
private int
minGram
private int
maxGram
private int
gramSize
private int
pos
private int
inLen
private String
inStr
private boolean
started
Constructors Summary
public NGramTokenizer(Reader input, int minGram, int maxGram)
Creates NGramTokenizer with given min and max n-grams.

param
input Reader holding the input to be tokenized
param
minGram the smallest n-gram to generate
param
maxGram the largest n-gram to generate


                                    
         
    super(input);
    // Validate the requested n-gram range before storing it:
    // sizes start at 1, and the lower bound may not exceed the upper bound.
    if (minGram <= 0) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    // Both bounds are inclusive; next() emits every size from minGram up to maxGram.
    this.maxGram = maxGram;
    this.minGram = minGram;
  
public NGramTokenizer(Reader input)
Creates NGramTokenizer with default min and max n-grams.

param
input Reader holding the input to be tokenized

    // Delegate to the three-argument constructor using the class-level default sizes.
    this(input, NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
  
Methods Summary
public final org.apache.lucene.analysis.Token next()
Returns the next token in the stream, or null at EOS.

    if (!started) {
      // First call: load the input into memory once; later calls only walk inStr.
      started = true;
      gramSize = minGram;
      // Only the first 1024 chars of the input are tokenized.
      char[] chars = new char[1024];
      // A single Reader.read() may deliver fewer chars than requested even when
      // more input follows, so keep reading until the buffer is full or the
      // stream ends. Stopping on a non-positive count also guarantees termination.
      int filled = 0;
      while (filled < chars.length) {
        int count = input.read(chars, filled, chars.length - filled);
        if (count <= 0) {
          break;
        }
        filled += count;
      }
      // trim() strips leading and trailing whitespace/control chars, so the
      // offsets reported below are relative to the trimmed string.
      inStr = new String(chars, 0, filled).trim();
      inLen = inStr.length();
    }

    if (pos + gramSize > inLen) {          // current size has run off the end of the string
      pos = 0;                             // restart at the beginning of the string
      gramSize++;                          // and move on to the next n-gram size
    }
    // Checked on every call (not only right after a size change) so that once the
    // stream is exhausted, further calls keep returning null instead of emitting
    // grams larger than maxGram.
    if (gramSize > maxGram || pos + gramSize > inLen) {
      return null;
    }

    // Emit the gram starting at pos; start/end offsets are [pos, pos + gramSize).
    int oldPos = pos;
    String gram = inStr.substring(oldPos, oldPos + gramSize);
    pos++;
    return new Token(gram, oldPos, oldPos + gramSize);