File | Doc Category | Size | Date | Package
ThaiWordFilter.java | API Doc | Apache Lucene 2.1.0 | 2437 | Wed Feb 14 10:46:32 GMT 2007 | org.apache.lucene.analysis.th

ThaiWordFilter

public class ThaiWordFilter extends TokenFilter
TokenFilter that uses java.text.BreakIterator to break each Thai Token into separate Token(s), one for each Thai word.
author
Samphan Raruenrom for To-Be-One Technology Co., Ltd.
version
0.2

Fields Summary
private BreakIterator
breaker
private Token
thaiToken
Constructors Summary
public ThaiWordFilter(TokenStream input)

  
     
    super(input);
    breaker = BreakIterator.getWordInstance(new Locale("th"));
  
Methods Summary
public org.apache.lucene.analysis.Tokennext()

    if (thaiToken != null) {
      String text = thaiToken.termText();
      int start = breaker.current();
      int end = breaker.next();
      if (end != BreakIterator.DONE) {
        return new Token(text.substring(start, end), 
            thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type());
      }
      thaiToken = null;
    }
    Token tk = input.next();
    if (tk == null) {
      return null;
    }
    String text = tk.termText();
    if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
      return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type());
    }
    thaiToken = tk;
    breaker.setText(text);
    int end = breaker.next();
    if (end != BreakIterator.DONE) {
      return new Token(text.substring(0, end), 
          thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type());
    }
    return null;