File | Doc | Category | Size | Date | Package
RussianAnalyzer.java | API Doc | Apache Lucene 1.4.3 | 7164 | Tue Mar 30 00:48:02 BST 2004 | org.apache.lucene.analysis.ru

RussianAnalyzer

public final class RussianAnalyzer extends Analyzer
Analyzer for Russian language. Supports an external list of stopwords (words that will not be indexed at all). A default set of stopwords is used unless an alternative list is specified.
author
Boris Okner, b.okner@rogers.com
version
$Id: RussianAnalyzer.java,v 1.7 2004/03/29 22:48:01 cutting Exp $

Fields Summary
private static char
A
private static char
B
private static char
V
private static char
G
private static char
D
private static char
E
private static char
ZH
private static char
Z
private static char
I
private static char
I_
private static char
K
private static char
L
private static char
M
private static char
N
private static char
O
private static char
P
private static char
R
private static char
S
private static char
T
private static char
U
private static char
F
private static char
X
private static char
TS
private static char
CH
private static char
SH
private static char
SHCH
private static char
HARD
private static char
Y
private static char
SOFT
private static char
AE
private static char
IU
private static char
IA
private static char[][]
RUSSIAN_STOP_WORDS
List of typical Russian stopwords.
private Set
stopSet
Contains the stopwords used with the StopFilter.
private char[]
charset
Charset for Russian letters. Represents encoding for 32 lowercase Russian letters. Predefined charsets can be taken from the RussianCharsets class.
Constructors Summary
public RussianAnalyzer()



      
        charset = RussianCharsets.UnicodeRussian;
        stopSet = StopFilter.makeStopSet(
                    makeStopWords(RussianCharsets.UnicodeRussian));
    
public RussianAnalyzer(char[] charset)
Builds an analyzer.

        this.charset = charset;
        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
    
public RussianAnalyzer(char[] charset, String[] stopwords)
Builds an analyzer with the given stop words.

        this.charset = charset;
        stopSet = StopFilter.makeStopSet(stopwords);
    
public RussianAnalyzer(char[] charset, Hashtable stopwords)
Builds an analyzer with the given stop words.

todo
create a Set version of this ctor

        this.charset = charset;
        stopSet = new HashSet(stopwords.keySet());
    
Methods Summary
private static java.lang.String[] makeStopWords(char[] charset)

        // Each entry of RUSSIAN_STOP_WORDS is a sequence of indices into
        // the charset table; look every index up to spell the word in the
        // requested charset.
        final int wordCount = RUSSIAN_STOP_WORDS.length;
        String[] translated = new String[wordCount];
        for (int w = 0; w < wordCount; w++)
        {
            char[] letterIndices = RUSSIAN_STOP_WORDS[w];
            char[] letters = new char[letterIndices.length];
            for (int k = 0; k < letterIndices.length; k++)
            {
                letters[k] = charset[letterIndices[k]];
            }
            translated[w] = new String(letters);
        }
        return translated;
    
public org.apache.lucene.analysis.TokenStream tokenStream(java.lang.String fieldName, java.io.Reader reader)
Creates a TokenStream which tokenizes all the text in the provided Reader.

return
A TokenStream built from a RussianLetterTokenizer filtered with RussianLowerCaseFilter, StopFilter, and RussianStemFilter

        // Build the analysis chain stage by stage; each stage wraps the
        // previous one, so the order here is the processing order.
        TokenStream letters = new RussianLetterTokenizer(reader, charset);
        TokenStream lowerCased = new RussianLowerCaseFilter(letters, charset);
        TokenStream withoutStopWords = new StopFilter(lowerCased, stopSet);
        return new RussianStemFilter(withoutStopWords, charset);