for (Token token = input.next(); token != null; token = input.next()) {
String text = token.termText();
// why not key off token type here assuming ChineseTokenizer comes first?
if (stopTable.get(text) == null) {
switch (Character.getType(text.charAt(0))) {
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
// English word/token should larger than 1 character.
if (text.length()>1) {
return token;
}
break;
case Character.OTHER_LETTER:
// One Chinese character as one Chinese word.
// Chinese word extraction to be added later here.
return token;
}
}
}
return null;