FileDocCategorySizeDatePackage
ChineseTokenizer.javaAPI DocApache Lucene 1.93554Mon Feb 20 09:18:48 GMT 2006org.apache.lucene.analysis.cn

ChineseTokenizer

public final class ChineseTokenizer extends Tokenizer
Title: ChineseTokenizer Description: Extract tokens from the Stream using Character.getType() Rule: A Chinese character as a single token Copyright: Copyright (c) 2001 Company: The difference between the ChineseTokenizer and the CJKTokenizer (id=23545) is that they have different token parsing logic. Let me use an example. If having a Chinese text "C1C2C3C4" to be indexed, the tokens returned from the ChineseTokenizer are C1, C2, C3, C4. And the tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4. Therefore the index the CJKTokenizer created is much larger. The problem is that when searching for C1, C1C2, C1C3, C4C2, C1C2C3 ... the ChineseTokenizer works, but the CJKTokenizer will not work.
author
Yiyi Sun
version
1.0

Fields Summary
private int
offset
private int
bufferIndex
private int
dataLen
private static final int
MAX_WORD_LEN
private static final int
IO_BUFFER_SIZE
private final char[]
buffer
private final char[]
ioBuffer
private int
length
private int
start
Constructors Summary
/**
 * Creates a tokenizer that reads characters from the given reader.
 *
 * @param in the character stream to tokenize
 */
public ChineseTokenizer(Reader in) {
    input = in;
}
Methods Summary
private final org.apache.lucene.analysis.Tokenflush()


        if (length>0) {
            //System.out.println(new String(buffer, 0, length));
            return new Token(new String(buffer, 0, length), start, start+length);
        }
        else
            return null;
    
public final org.apache.lucene.analysis.Tokennext()


        length = 0;
        start = offset;


        while (true) {

            final char c;
            offset++;

            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            };

            if (dataLen == -1) return flush();
            else
                c = ioBuffer[bufferIndex++];


            switch(Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                push(c);
                if (length == MAX_WORD_LEN) return flush();
                break;

            case Character.OTHER_LETTER:
                if (length>0) {
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                push(c);
                return flush();

            default:
                if (length>0) return flush();
                break;
            }
        }

    
private final voidpush(char c)



         

        if (length == 0) start = offset-1;            // start of token
        buffer[length++] = Character.toLowerCase(c);  // buffer it