File | Doc | Category | Size | Date | Package
RussianStemmer.java | API Doc | Apache Lucene 1.4.3 | 21428 | Tue Mar 30 00:48:02 BST 2004 | org.apache.lucene.analysis.ru

RussianStemmer

public class RussianStemmer extends Object
Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
author
Boris Okner, b.okner@rogers.com
version
$Id: RussianStemmer.java,v 1.5 2004/03/29 22:48:01 cutting Exp $

Fields Summary
private char[]
charset
private int
RV
private int
R1
private int
R2
private static char
A
private static char
B
private static char
V
private static char
G
private static char
D
private static char
E
private static char
ZH
private static char
Z
private static char
I
private static char
I_
private static char
K
private static char
L
private static char
M
private static char
N
private static char
O
private static char
P
private static char
R
private static char
S
private static char
T
private static char
U
private static char
F
private static char
X
private static char
TS
private static char
CH
private static char
SH
private static char
SHCH
private static char
HARD
private static char
Y
private static char
SOFT
private static char
AE
private static char
IU
private static char
IA
private static char[]
vowels
private static char[][]
perfectiveGerundEndings1
private static char[][]
perfectiveGerund1Predessors
private static char[][]
perfectiveGerundEndings2
private static char[][]
adjectiveEndings
private static char[][]
participleEndings1
private static char[][]
participleEndings2
private static char[][]
participle1Predessors
private static char[][]
reflexiveEndings
private static char[][]
verbEndings1
private static char[][]
verbEndings2
private static char[][]
verb1Predessors
private static char[][]
nounEndings
private static char[][]
superlativeEndings
private static char[][]
derivationalEndings
Constructors Summary
public RussianStemmer()
Creates a stemmer. This constructor does not assign a charset.


            
     
    
        // Only the superclass constructor runs here; this constructor does not assign charset.
        super();
    
public RussianStemmer(char[] charset)
Creates a stemmer that uses the given charset.

        super();
        // Keep a reference to the caller's array (no copy is made).
        this.charset = charset;
    
Methods Summary
private boolean adjectival(java.lang.StringBuffer stemmingZone)
Adjectival ending is an adjective ending, optionally preceded by participle ending. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        // An adjectival ending requires an adjective ending; if none is
        // present the stemming zone is left untouched.
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
            return false;
        // A participle ending in front of the adjective ending is optional:
        // strip it when present, but the result is true either way.
        // Group 2 is tried only when group 1 removed nothing.
        if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
        {
            findAndRemoveEnding(stemmingZone, participleEndings2);
        }
        return true;
    
private boolean derivational(java.lang.StringBuffer stemmingZone)
Derivational endings. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
            // no derivational ending found
            return false;
        // markPositions() leaves R2 at 0 when the word has no R2 region.
        // An empty R2 cannot contain the ending, so nothing may be removed;
        // without this guard R2 - RV is negative and the test below always passes.
        if (R2 == 0)
            return false;
        // The stemming zone starts at RV, so R2 - RV is where R2 begins
        // inside the zone. The whole ending must start at or after that point.
        int endingStart = stemmingZone.length() - endingLength;
        if (R2 - RV > endingStart)
            return false;
        stemmingZone.setLength(endingStart);
        return true;
    
private boolean findAndRemoveEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass)
Finds the ending among the given class of endings and removes it from stemming zone. Creation date: (17/03/2002 8:18:34 PM)

        // Length of the matching ending, or 0 when the zone ends with none of them.
        int matchedLength = findEnding(stemmingZone, theEndingClass);
        boolean found = matchedLength != 0;
        if (found)
        {
            // cut the matched ending off the end of the zone
            stemmingZone.setLength(stemmingZone.length() - matchedLength);
        }
        return found;
    
private boolean findAndRemoveEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass, char[][] thePredessors)
Finds the ending among the given class of endings, then checks whether this ending is preceded by any of the given predecessors and, if so, removes it from the stemming zone. Creation date: (17/03/2002 8:18:34 PM)

        int matchedLength = findEnding(stemmingZone, theEndingClass);
        if (matchedLength == 0)
            return false; // the zone ends with none of the endings

        // Index of the last character in front of the matched ending;
        // this is -1 when the ending fills the whole zone.
        int beforeEnding = stemmingZone.length() - matchedLength - 1;
        if (findEnding(stemmingZone, beforeEnding, thePredessors) == 0)
            return false; // ending present, but not after a required predecessor

        // Only the ending is cut; the predecessor stays in the zone.
        stemmingZone.setLength(stemmingZone.length() - matchedLength);
        return true;

    
private int findEnding(java.lang.StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
Finds an ending from the given ending class and returns the length of the ending found (0 if not found). Creation date: (17/03/2002 8:18:34 PM)

        // Candidates are tried from the last table entry down to the first;
        // the first one that matches determines the result.
        for (int k = theEndingClass.length; k-- > 0; )
        {
            char[] candidate = theEndingClass[k];
            // the candidate does not fit into the characters up to startIndex
            if (candidate.length - 1 > startIndex)
                continue;
            // Compare right to left: the candidate's last letter against the
            // character at startIndex, then moving towards the front.
            // Candidate entries are indices into charset.
            int zonePos = startIndex;
            int letter = candidate.length - 1;
            while (letter >= 0
                && stemmingZone.charAt(zonePos) == charset[candidate[letter]])
            {
                zonePos--;
                letter--;
            }
            // every letter matched
            if (letter < 0)
                return candidate.length;
        }
        return 0;
    
private int findEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass)

        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    
private boolean isVowel(char letter)
Checks if character is a vowel. Creation date: (16/03/2002 10:47:03 PM)

return
boolean
param
letter char

        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
                return true;
        }
        return false;
    
private void markPositions(java.lang.String word)
Marks positions of RV, R1 and R2 in a given word. Creation date: (16/03/2002 3:40:11 PM)

        // Reset all markers; a marker left at 0 means its region was not found.
        RV = 0;
        R1 = 0;
        R2 = 0;
        final int lastIndex = word.length() - 1;
        int pos = 0;

        // RV: starts right after the first vowel of the word.
        while (pos <= lastIndex && !isVowel(word.charAt(pos)))
            pos++;
        pos++;
        if (pos > lastIndex)
            return; // RV zone is empty
        RV = pos;

        // R1: skip any further vowels, then step over the next character.
        while (pos <= lastIndex && isVowel(word.charAt(pos)))
            pos++;
        pos++;
        if (pos > lastIndex)
            return; // R1 zone is empty
        R1 = pos;

        // R2: the same search repeated from R1. Skip non-vowels, step over
        // one character, skip vowels, then step over the next character.
        while (pos <= lastIndex && !isVowel(word.charAt(pos)))
            pos++;
        pos++;
        if (pos > lastIndex)
            return; // R2 zone is empty
        while (pos <= lastIndex && isVowel(word.charAt(pos)))
            pos++;
        pos++;
        if (pos > lastIndex)
            return; // R2 zone is empty
        R2 = pos;
    
private boolean noun(java.lang.StringBuffer stemmingZone)
Noun endings. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        return findAndRemoveEnding(stemmingZone, nounEndings);
    
private boolean perfectiveGerund(java.lang.StringBuffer stemmingZone)
Perfective gerund endings. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        // Group 1 endings count only when preceded by one of their predecessors.
        if (findAndRemoveEnding(
                stemmingZone,
                perfectiveGerundEndings1,
                perfectiveGerund1Predessors))
        {
            return true;
        }
        // Group 2 endings are tried only when group 1 removed nothing.
        return findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    
private boolean reflexive(java.lang.StringBuffer stemmingZone)
Reflexive endings. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    
private boolean removeI(java.lang.StringBuffer stemmingZone)
Removes the letter I from the end of the stemming zone, if the zone ends with it. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        if (stemmingZone.length() > 0
            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        else
        {
            return false;
        }
    
private boolean removeSoft(java.lang.StringBuffer stemmingZone)
Removes the SOFT letter from the end of the stemming zone, if the zone ends with it. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        if (stemmingZone.length() > 0
            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        else
        {
            return false;
        }
    
public void setCharset(char[] newCharset)
Replaces the charset used by this stemmer. Creation date: (16/03/2002 10:58:42 PM)

param
newCharset char[]

        // The array reference is stored as is; no copy is made.
        this.charset = newCharset;
    
private void setEndings()
Set ending definition as in Russian stemming algorithm. Creation date: (16/03/2002 11:16:36 PM)

        // (Re)assigns the static ending tables. Each table entry is a sequence
        // of letter constants; findEnding() and isVowel() use them as indices
        // into charset. findEnding() walks a table from its last entry to its
        // first and stops at the first match, so entry order is significant.

        // letters treated as vowels by isVowel()
        vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };

        // perfective gerund, group 1: removed only after a group 1 predecessor
        perfectiveGerundEndings1 = new char[][] {
            { V }, { V, SH, I }, { V, SH, I, S, SOFT }
        };

        perfectiveGerund1Predessors = new char[][] { { A }, { IA }
        };

        // perfective gerund, group 2: removed without a predecessor check
        perfectiveGerundEndings2 = new char[][] {
            { I, V },
            { Y, V },
            { I, V, SH, I },
            { Y, V, SH, I },
            { I, V, SH, I, S, SOFT },
            { Y, V, SH, I, S, SOFT }
        };

        adjectiveEndings = new char[][] {
            { E, E },
            { I, E },
            { Y, E },
            { O, E },
            { E, I_ },
            { I, I_ },
            { Y, I_ },
            { O, I_ },
            { E, M },
            { I, M },
            { Y, M },
            { O, M },
            { I, X },
            { Y, X },
            { U, IU },
            { IU, IU },
            { A, IA },
            { IA, IA },
            { O, IU },
            { E, IU },
            { I, M, I },
            { Y, M, I },
            { E, G, O },
            { O, G, O },
            { E, M, U },
            { O, M, U }
        };

        // participle, group 1: removed only after a group 1 predecessor
        participleEndings1 = new char[][] {
            { SHCH },
            { E, M },
            { N, N },
            { V, SH },
            { IU, SHCH }
        };

        // participle, group 2: removed without a predecessor check
        participleEndings2 = new char[][] {
            { I, V, SH },
            { Y, V, SH },
            { U, IU, SHCH }
        };

        participle1Predessors = new char[][] {
            { A },
            { IA }
        };

        reflexiveEndings = new char[][] {
            { S, IA },
            { S, SOFT }
        };

        // verb, group 1: removed only after a group 1 predecessor
        verbEndings1 = new char[][] {
            { I_ },
            { L },
            { N },
            { L, O },
            { N, O },
            { E, T },
            { IU, T },
            { L, A },
            { N, A },
            { L, I },
            { E, M },
            { N, Y },
            { E, T, E },
            { I_, T, E },
            { T, SOFT },
            { E, SH, SOFT },
            { N, N, O }
        };

        // verb, group 2: removed without a predecessor check
        verbEndings2 = new char[][] {
            { IU },
            { U, IU },
            { E, N },
            { E, I_ },
            { IA, T },
            { U, I_ },
            { I, L },
            { Y, L },
            { I, M },
            { Y, M },
            { I, T },
            { Y, T },
            { I, L, A },
            { Y, L, A },
            { E, N, A },
            { I, T, E },
            { I, L, I },
            { Y, L, I },
            { I, L, O },
            { Y, L, O },
            { E, N, O },
            { U, E, T },
            { U, IU, T },
            { E, N, Y },
            { I, T, SOFT },
            { Y, T, SOFT },
            { I, SH, SOFT },
            { E, I_, T, E },
            { U, I_, T, E }
        };

        verb1Predessors = new char[][] {
            { A },
            { IA }
        };

        nounEndings = new char[][] {
            { A },
            { IU },
            { I_ },
            { O },
            { U },
            { E },
            { Y },
            { I },
            { SOFT },
            { IA },
            { E, V },
            { O, V },
            { I, E },
            { SOFT, E },
            { IA, X },
            { I, IU },
            { E, I },
            { I, I },
            { E, I_ },
            { O, I_ },
            { E, M },
            { A, M },
            { O, M },
            { A, X },
            { SOFT, IU },
            { I, IA },
            { SOFT, IA },
            { I, I_ },
            { IA, M },
            { IA, M, I },
            { A, M, I },
            { I, E, I_ },
            { I, IA, M },
            { I, E, M },
            { I, IA, X },
            { I, IA, M, I }
        };

        superlativeEndings = new char[][] {
            { E, I_, SH },
            { E, I_, SH, E }
        };

        // removed by derivational() only when the ending lies inside R2
        derivationalEndings = new char[][] {
            { O, S, T },
            { O, S, T, SOFT }
        };
    
public java.lang.String stem(java.lang.String input)
Finds the stem for given Russian word. Creation date: (16/03/2002 3:36:48 PM)

return
java.lang.String
param
input java.lang.String

        // Locate RV, R1 and R2 for this word; RV stays 0 when there is nothing to stem.
        markPositions(input);
        if (RV == 0)
            return input; // RV wasn't detected, nothing to stem
        // All suffix removal happens inside the RV region only.
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

        // Step 1: a perfective gerund ending excludes the other classes.
        // Otherwise drop an optional reflexive ending, then the first of
        // adjectival, verb or noun ending that applies.
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        // Step 2: drop a trailing I
        removeI(stemmingZone);
        // Step 3: drop a derivational ending lying inside R2
        derivational(stemmingZone);
        // Step 4: superlative ending, doubled N, trailing soft sign
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // Re-attach the untouched part of the word in front of RV.
        return input.substring(0, RV) + stemmingZone.toString();
    
public static java.lang.String stem(java.lang.String theWord, char[] charset)
Static method for stemming with different charsets

        // A fresh stemmer per call: the region markers are instance state.
        RussianStemmer stemmer = new RussianStemmer(charset);
        String stemmed = stemmer.stem(theWord);
        return stemmed;
    
private boolean superlative(java.lang.StringBuffer stemmingZone)
Superlative endings. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    
private boolean undoubleN(java.lang.StringBuffer stemmingZone)
Undoubles N. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        // A one-entry ending class so that findEnding() can detect a trailing "NN".
        char[][] doubleN = { { N, N } };
        boolean endsWithDoubleN = findEnding(stemmingZone, doubleN) != 0;
        if (endsWithDoubleN)
        {
            // remove only one of the two N letters
            stemmingZone.setLength(stemmingZone.length() - 1);
        }
        return endsWithDoubleN;
    
private boolean verb(java.lang.StringBuffer stemmingZone)
Verb endings. Creation date: (17/03/2002 12:14:58 AM)

param
stemmingZone java.lang.StringBuffer

        // Group 1 endings count only when preceded by one of their predecessors.
        if (findAndRemoveEnding(
                stemmingZone,
                verbEndings1,
                verb1Predessors))
        {
            return true;
        }
        // Group 2 endings are tried only when group 1 removed nothing.
        return findAndRemoveEnding(stemmingZone, verbEndings2);