File Doc Category Size Date Package
RussianStemmer.java API Doc Apache Lucene 1.4.3 21428 Tue Mar 30 00:48:02 BST 2004 org.apache.lucene.analysis.ru

RussianStemmer

java.lang.Object

public class RussianStemmer extends Object

Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).

author: Boris Okner, b.okner@rogers.com
version: $Id: RussianStemmer.java,v 1.5 2004/03/29 22:48:01 cutting Exp $

Fields Summary
private char[]
charset
private int
RV
private int
R1
private int
R2
private static char
A
private static char
B
private static char
V
private static char
G
private static char
D
private static char
E
private static char
ZH
private static char
Z
private static char
I
private static char
I_
private static char
K
private static char
L
private static char
M
private static char
N
private static char
O
private static char
P
private static char
R
private static char
S
private static char
T
private static char
U
private static char
F
private static char
X
private static char
TS
private static char
CH
private static char
SH
private static char
SHCH
private static char
HARD
private static char
Y
private static char
SOFT
private static char
AE
private static char
IU
private static char
IA
private static char[]
vowels
private static char[]
perfectiveGerundEndings1
private static char[]
perfectiveGerund1Predessors
private static char[]
perfectiveGerundEndings2
private static char[]
adjectiveEndings
private static char[]
participleEndings1
private static char[]
participleEndings2
private static char[]
participle1Predessors
private static char[]
reflexiveEndings
private static char[]
verbEndings1
private static char[]
verbEndings2
private static char[]
verb1Predessors
private static char[]
nounEndings
private static char[]
superlativeEndings
private static char[]
derivationalEndings
Constructors Summary
/**
 * Creates a stemmer without assigning a charset.
 *
 * NOTE(review): the matching code indexes into <code>charset</code>, so a
 * charset presumably has to be supplied through setCharset(char[]) before
 * stem(String) is used on an instance built this way - confirm.
 */
public RussianStemmer() {
    // Nothing to initialise; the superclass constructor runs implicitly.
}
/**
 * Creates a stemmer that works with the given charset table.
 *
 * @param charset table that the letter constants are used to index into;
 *                the array is stored by reference, not copied
 */
public RussianStemmer(char[] charset) {
    this.charset = charset;
}
Methods Summary
private boolean adjectival(java.lang.StringBuffer stemmingZone)
Adjectival ending is an adjective ending, optionally preceded by participle ending. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
// look for adjective ending in a stemming zone if (!findAndRemoveEnding(stemmingZone, adjectiveEndings)) return false; // if adjective ending was found, try for participle ending boolean r = findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors) || findAndRemoveEnding(stemmingZone, participleEndings2); return true;
private boolean derivational(java.lang.StringBuffer stemmingZone)
Derivational endings Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
int endingLength = findEnding(stemmingZone, derivationalEndings); if (endingLength == 0) // no derivational ending found return false; else { // Ensure that the ending locates in R2 if (R2 - RV <= stemmingZone.length() - endingLength) { stemmingZone.setLength(stemmingZone.length() - endingLength); return true; } else { return false; } }
private boolean findAndRemoveEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass)
Finds the ending among the given class of endings and removes it from stemming zone. Creation date: (17/03/2002 8:18:34 PM)
int endingLength = findEnding(stemmingZone, theEndingClass); if (endingLength == 0) // not found return false; else { stemmingZone.setLength(stemmingZone.length() - endingLength); // cut the ending found return true; }
private boolean findAndRemoveEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass, char[][] thePredessors)
Finds the ending among the given class of endings, then checks if this ending was preceded by any of given predessors, and if so, removes it from stemming zone. Creation date: (17/03/2002 8:18:34 PM)
int endingLength = findEnding(stemmingZone, theEndingClass); if (endingLength == 0) // not found return false; else { int predessorLength = findEnding(stemmingZone, stemmingZone.length() - endingLength - 1, thePredessors); if (predessorLength == 0) return false; else { stemmingZone.setLength(stemmingZone.length() - endingLength); // cut the ending found return true; } }
private int findEnding(java.lang.StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
Finds ending among given ending class and returns the length of ending found(0, if not found). Creation date: (17/03/2002 8:18:34 PM)
boolean match = false; for (int i = theEndingClass.length - 1; i >= 0; i--) { char[] theEnding = theEndingClass[i]; // check if the ending is bigger than stemming zone if (startIndex < theEnding.length - 1) { match = false; continue; } match = true; int stemmingIndex = startIndex; for (int j = theEnding.length - 1; j >= 0; j--) { if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]]) { match = false; break; } } // check if ending was found if (match) { return theEndingClass[i].length; // cut ending } } return 0;
private int findEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass)
return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
private boolean isVowel(char letter)
Checks if character is a vowel.. Creation date: (16/03/2002 10:47:03 PM)
return
boolean
param
letter char
for (int i = 0; i < vowels.length; i++) { if (letter == charset[vowels[i]]) return true; } return false;
private void markPositions(java.lang.String word)
Marks positions of RV, R1 and R2 in a given word. Creation date: (16/03/2002 3:40:11 PM)
RV = 0; R1 = 0; R2 = 0; int i = 0; // find RV while (word.length() > i && !isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // RV zone is empty RV = i; // find R1 while (word.length() > i && isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R1 zone is empty R1 = i; // find R2 while (word.length() > i && !isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R2 zone is empty while (word.length() > i && isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R2 zone is empty R2 = i;
private boolean noun(java.lang.StringBuffer stemmingZone)
Noun endings. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
return findAndRemoveEnding(stemmingZone, nounEndings);
private boolean perfectiveGerund(java.lang.StringBuffer stemmingZone)
Perfective gerund endings. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
return findAndRemoveEnding( stemmingZone, perfectiveGerundEndings1, perfectiveGerund1Predessors) || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
private boolean reflexive(java.lang.StringBuffer stemmingZone)
Reflexive endings. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
return findAndRemoveEnding(stemmingZone, reflexiveEndings);
private boolean removeI(java.lang.StringBuffer stemmingZone)
Insert the method's description here. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
if (stemmingZone.length() > 0 && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I]) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; }
private boolean removeSoft(java.lang.StringBuffer stemmingZone)
Insert the method's description here. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
if (stemmingZone.length() > 0 && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT]) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; }
public void setCharset(char[] newCharset)
Insert the method's description here. Creation date: (16/03/2002 10:58:42 PM)
param
newCharset char[]
charset = newCharset;
private void setEndings()
Set ending definition as in Russian stemming algorithm. Creation date: (16/03/2002 11:16:36 PM)
vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA }; perfectiveGerundEndings1 = new char[][] { { V }, { V, SH, I }, { V, SH, I, S, SOFT } }; perfectiveGerund1Predessors = new char[][] { { A }, { IA } }; perfectiveGerundEndings2 = new char[][] { { I, V }, { Y, V }, { I, V, SH, I }, { Y, V, SH, I }, { I, V, SH, I, S, SOFT }, { Y, V, SH, I, S, SOFT } }; adjectiveEndings = new char[][] { { E, E }, { I, E }, { Y, E }, { O, E }, { E, I_ }, { I, I_ }, { Y, I_ }, { O, I_ }, { E, M }, { I, M }, { Y, M }, { O, M }, { I, X }, { Y, X }, { U, IU }, { IU, IU }, { A, IA }, { IA, IA }, { O, IU }, { E, IU }, { I, M, I }, { Y, M, I }, { E, G, O }, { O, G, O }, { E, M, U }, { O, M, U } }; participleEndings1 = new char[][] { { SHCH }, { E, M }, { N, N }, { V, SH }, { IU, SHCH } }; participleEndings2 = new char[][] { { I, V, SH }, { Y, V, SH }, { U, IU, SHCH } }; participle1Predessors = new char[][] { { A }, { IA } }; reflexiveEndings = new char[][] { { S, IA }, { S, SOFT } }; verbEndings1 = new char[][] { { I_ }, { L }, { N }, { L, O }, { N, O }, { E, T }, { IU, T }, { L, A }, { N, A }, { L, I }, { E, M }, { N, Y }, { E, T, E }, { I_, T, E }, { T, SOFT }, { E, SH, SOFT }, { N, N, O } }; verbEndings2 = new char[][] { { IU }, { U, IU }, { E, N }, { E, I_ }, { IA, T }, { U, I_ }, { I, L }, { Y, L }, { I, M }, { Y, M }, { I, T }, { Y, T }, { I, L, A }, { Y, L, A }, { E, N, A }, { I, T, E }, { I, L, I }, { Y, L, I }, { I, L, O }, { Y, L, O }, { E, N, O }, { U, E, T }, { U, IU, T }, { E, N, Y }, { I, T, SOFT }, { Y, T, SOFT }, { I, SH, SOFT }, { E, I_, T, E }, { U, I_, T, E } }; verb1Predessors = new char[][] { { A }, { IA } }; nounEndings = new char[][] { { A }, { IU }, { I_ }, { O }, { U }, { E }, { Y }, { I }, { SOFT }, { IA }, { E, V }, { O, V }, { I, E }, { SOFT, E }, { IA, X }, { I, IU }, { E, I }, { I, I }, { E, I_ }, { O, I_ }, { E, M }, { A, M }, { O, M }, { A, X }, { SOFT, IU }, { I, IA }, { SOFT, IA }, { I, I_ }, { IA, M }, { IA, M, I }, { A, M, I }, { I, E, I_ }, { I, IA, M }, 
{ I, E, M }, { I, IA, X }, { I, IA, M, I } }; superlativeEndings = new char[][] { { E, I_, SH }, { E, I_, SH, E } }; derivationalEndings = new char[][] { { O, S, T }, { O, S, T, SOFT } };
public java.lang.String stem(java.lang.String input)
Finds the stem for given Russian word. Creation date: (16/03/2002 3:36:48 PM)
return
java.lang.String
param
input java.lang.String
markPositions(input); if (RV == 0) return input; //RV wasn't detected, nothing to stem StringBuffer stemmingZone = new StringBuffer(input.substring(RV)); // stemming goes on in RV // Step 1 if (!perfectiveGerund(stemmingZone)) { reflexive(stemmingZone); boolean r = adjectival(stemmingZone) || verb(stemmingZone) || noun(stemmingZone); } // Step 2 removeI(stemmingZone); // Step 3 derivational(stemmingZone); // Step 4 superlative(stemmingZone); undoubleN(stemmingZone); removeSoft(stemmingZone); // return result return input.substring(0, RV) + stemmingZone.toString();
public static java.lang.String stem(java.lang.String theWord, char[] charset)
Static method for stemming with different charsets
RussianStemmer stemmer = new RussianStemmer(); stemmer.setCharset(charset); return stemmer.stem(theWord);
private boolean superlative(java.lang.StringBuffer stemmingZone)
Superlative endings. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
return findAndRemoveEnding(stemmingZone, superlativeEndings);
private boolean undoubleN(java.lang.StringBuffer stemmingZone)
Undoubles N. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
char[][] doubleN = { { N, N } }; if (findEnding(stemmingZone, doubleN) != 0) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; }
private boolean verb(java.lang.StringBuffer stemmingZone)
Verb endings. Creation date: (17/03/2002 12:14:58 AM)
param
stemmingZone java.lang.StringBuffer
return findAndRemoveEnding( stemmingZone, verbEndings1, verb1Predessors) || findAndRemoveEnding(stemmingZone, verbEndings2);