package org.apache.lucene.analysis.ru;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
/**
* Analyzer for Russian language. Supports an external list of stopwords (words that
* will not be indexed at all).
* A default set of stopwords is used unless an alternative list is specified.
*
* @author Boris Okner, b.okner@rogers.com
* @version $Id: RussianAnalyzer.java 472959 2006-11-09 16:21:50Z yonik $
*/
public final class RussianAnalyzer extends Analyzer
{
    // Letter codes. Each constant is an index into a charset table (see the
    // charset field); makeStopWords() uses them to spell the stop words in
    // whatever encoding the charset table represents. Letters that no stop
    // word needs stay commented out so the index sequence remains visible.
    private final static char A = 0;
    private final static char B = 1;
    private final static char V = 2;
    private final static char G = 3;
    private final static char D = 4;
    private final static char E = 5;
    private final static char ZH = 6;
    private final static char Z = 7;
    private final static char I = 8;
    private final static char I_ = 9;
    private final static char K = 10;
    private final static char L = 11;
    private final static char M = 12;
    private final static char N = 13;
    private final static char O = 14;
    private final static char P = 15;
    private final static char R = 16;
    private final static char S = 17;
    private final static char T = 18;
    private final static char U = 19;
    //private final static char F = 20;
    private final static char X = 21;
    //private final static char TS = 22;
    private final static char CH = 23;
    private final static char SH = 24;
    private final static char SHCH = 25;
    //private final static char HARD = 26;
    private final static char Y = 27;
    private final static char SOFT = 28;
    private final static char AE = 29;
    private final static char IU = 30;
    private final static char IA = 31;

    /**
     * List of typical Russian stopwords. Every word is spelled as a sequence
     * of the letter codes declared above, not as real characters; the codes
     * are translated through a charset table by
     * {@link #makeStopWords(char[])}.
     */
    private static final char[][] RUSSIAN_STOP_WORDS = {
        {A},
        {B, E, Z},
        {B, O, L, E, E},
        {B, Y},
        {B, Y, L},
        {B, Y, L, A},
        {B, Y, L, I},
        {B, Y, L, O},
        {B, Y, T, SOFT},
        {V},
        {V, A, M},
        {V, A, S},
        {V, E, S, SOFT},
        {V, O},
        {V, O, T},
        {V, S, E},
        {V, S, E, G, O},
        {V, S, E, X},
        {V, Y},
        {G, D, E},
        {D, A},
        {D, A, ZH, E},
        {D, L, IA},
        {D, O},
        {E, G, O},
        {E, E},
        {E, I_},
        {E, IU},
        {E, S, L, I},
        {E, S, T, SOFT},
        {E, SHCH, E},
        {ZH, E},
        {Z, A},
        {Z, D, E, S, SOFT},
        {I},
        {I, Z},
        {I, L, I},
        {I, M},
        {I, X},
        {K},
        {K, A, K},
        {K, O},
        {K, O, G, D, A},
        {K, T, O},
        {L, I},
        {L, I, B, O},
        {M, N, E},
        {M, O, ZH, E, T},
        {M, Y},
        {N, A},
        {N, A, D, O},
        {N, A, SH},
        {N, E},
        {N, E, G, O},
        {N, E, E},
        {N, E, T},
        {N, I},
        {N, I, X},
        {N, O},
        {N, U},
        {O},
        {O, B},
        {O, D, N, A, K, O},
        {O, N},
        {O, N, A},
        {O, N, I},
        {O, N, O},
        {O, T},
        {O, CH, E, N, SOFT},
        {P, O},
        {P, O, D},
        {P, R, I},
        {S},
        {S, O},
        {T, A, K},
        {T, A, K, ZH, E},
        {T, A, K, O, I_},
        {T, A, M},
        {T, E},
        {T, E, M},
        {T, O},
        {T, O, G, O},
        {T, O, ZH, E},
        {T, O, I_},
        {T, O, L, SOFT, K, O},
        {T, O, M},
        {T, Y},
        {U},
        {U, ZH, E},
        {X, O, T, IA},
        {CH, E, G, O},
        {CH, E, I_},
        {CH, E, M},
        {CH, T, O},
        {CH, T, O, B, Y},
        {CH, SOFT, E},
        {CH, SOFT, IA},
        {AE, T, A},
        {AE, T, I},
        {AE, T, O},
        {IA}
    };

    /**
     * Contains the stopwords used with the StopFilter.
     * Assigned exactly once, by whichever constructor runs.
     */
    private final Set stopSet;

    /**
     * Charset for Russian letters.
     * Represents encoding for 32 lowercase Russian letters.
     * Predefined charsets can be taken from the RussianCharsets class.
     */
    private final char[] charset;

    /**
     * Builds an analyzer that uses the RussianCharsets.UnicodeRussian charset
     * and the default stop words translated into that charset.
     */
    public RussianAnalyzer() {
        this(RussianCharsets.UnicodeRussian);
    }

    /**
     * Builds an analyzer that uses the given charset and the default stop
     * words translated into that charset.
     *
     * @param charset table mapping letter codes to characters; the default
     *                stop words use codes up to 31, so it needs at least
     *                32 entries
     */
    public RussianAnalyzer(char[] charset)
    {
        this.charset = charset;
        this.stopSet = StopFilter.makeStopSet(makeStopWords(charset));
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param charset   table mapping letter codes to characters
     * @param stopwords the stop words, passed to StopFilter.makeStopSet
     */
    public RussianAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
        this.stopSet = StopFilter.makeStopSet(stopwords);
    }

    /**
     * Builds an analyzer with the given stop words. The keys of the table are
     * copied into the stop set; its values are not read.
     *
     * @param charset   table mapping letter codes to characters
     * @param stopwords table whose keys are the stop words
     * @todo create a Set version of this ctor
     */
    public RussianAnalyzer(char[] charset, Hashtable stopwords)
    {
        this.charset = charset;
        this.stopSet = new HashSet(stopwords.keySet());
    }

    /**
     * Takes the default Russian stop words and translates them to a String
     * array, using the given charset.
     *
     * @param charset table indexed by letter code; must cover every code used
     *                in RUSSIAN_STOP_WORDS (the largest is 31), otherwise the
     *                lookup fails with an ArrayIndexOutOfBoundsException
     * @return one String per entry of RUSSIAN_STOP_WORDS, in the same order
     */
    private static String[] makeStopWords(char[] charset)
    {
        String[] words = new String[RUSSIAN_STOP_WORDS.length];
        for (int i = 0; i < words.length; i++)
        {
            char[] letterCodes = RUSSIAN_STOP_WORDS[i];
            // The translated word has exactly one character per letter code,
            // so a plain char array is enough to assemble it.
            char[] translated = new char[letterCodes.length];
            for (int j = 0; j < letterCodes.length; j++)
            {
                translated[j] = charset[letterCodes[j]];
            }
            words[i] = new String(translated);
        }
        return words;
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @param fieldName name of the field being analyzed; not used by this
     *                  implementation
     * @param reader    source of the text to tokenize
     * @return A TokenStream built from a RussianLetterTokenizer filtered with
     *         RussianLowerCaseFilter, StopFilter, and RussianStemFilter
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        // The order matters: stop words are removed after lower-casing and
        // before stemming, exactly in the sequence the filters are chained.
        TokenStream result = new RussianLetterTokenizer(reader, charset);
        result = new RussianLowerCaseFilter(result, charset);
        result = new StopFilter(result, stopSet);
        result = new RussianStemFilter(result, charset);
        return result;
    }
}
|