RussianStemmerpublic class RussianStemmer extends Object Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description). |
Fields Summary |
---|
private char[] | charset | private int | RV | private int | R1 | private int | R2 | private static char | A | private static char | B | private static char | V | private static char | G | private static char | D | private static char | E | private static char | ZH | private static char | Z | private static char | I | private static char | I_ | private static char | K | private static char | L | private static char | M | private static char | N | private static char | O | private static char | P | private static char | R | private static char | S | private static char | T | private static char | U | private static char | F | private static char | X | private static char | TS | private static char | CH | private static char | SH | private static char | SHCH | private static char | HARD | private static char | Y | private static char | SOFT | private static char | AE | private static char | IU | private static char | IA | private static char[] | vowels | private static char[] | perfectiveGerundEndings1 | private static char[] | perfectiveGerund1Predessors | private static char[] | perfectiveGerundEndings2 | private static char[] | adjectiveEndings | private static char[] | participleEndings1 | private static char[] | participleEndings2 | private static char[] | participle1Predessors | private static char[] | reflexiveEndings | private static char[] | verbEndings1 | private static char[] | verbEndings2 | private static char[] | verb1Predessors | private static char[] | nounEndings | private static char[] | superlativeEndings | private static char[] | derivationalEndings |
Constructors Summary |
---|
public RussianStemmer()Creates a stemmer with no charset set; supply one with setCharset before stemming.
// Only invokes the superclass constructor; the charset field is not assigned here.
super();
| public RussianStemmer(char[] charset)Creates a stemmer that uses the given charset table.
// Invokes the superclass constructor, then stores the caller-supplied charset table
// (the array itself is kept, not a copy).
super();
this.charset = charset;
|
Methods Summary |
---|
private boolean | adjectival(java.lang.StringBuffer stemmingZone)Adjectival ending is an adjective ending,
optionally preceded by participle ending.
Creation date: (17/03/2002 12:14:58 AM)
// An adjectival ending is an adjective ending, optionally preceded by a
// participle ending. The result reports only whether the adjective ending
// was removed; removal of a participle ending does not affect it.
boolean adjectiveRemoved = findAndRemoveEnding(stemmingZone, adjectiveEndings);
if (adjectiveRemoved)
{
    // Try the predecessor-guarded participle endings first and fall back to
    // the unconditional ones only when the first group removed nothing.
    if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
    {
        findAndRemoveEnding(stemmingZone, participleEndings2);
    }
}
return adjectiveRemoved;
| private boolean | derivational(java.lang.StringBuffer stemmingZone)Derivational endings
Creation date: (17/03/2002 12:14:58 AM)
// Removes a derivational ending from the stemming zone, but only when that
// ending lies entirely inside the R2 region. Returns true if it was removed.
int endingLength = findEnding(stemmingZone, derivationalEndings);
if (endingLength == 0)
{
    // no derivational ending found
    return false;
}
// markPositions() leaves R2 at 0 when the word has no R2 region. Without this
// guard R2 - RV would be negative, so the range check below would always
// succeed and the ending would be stripped even though R2 is empty.
if (R2 == 0)
{
    return false;
}
// The stemming zone starts at RV, so R2 - RV is the offset of R2 inside the
// zone; the ending is in R2 when it starts at or after that offset.
if (R2 - RV <= stemmingZone.length() - endingLength)
{
    stemmingZone.setLength(stemmingZone.length() - endingLength);
    return true;
}
return false;
| private boolean | findAndRemoveEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass)Finds the ending among the given class of endings and removes it from stemming zone.
Creation date: (17/03/2002 8:18:34 PM)
// Look up the ending at the tail of the zone; a length of 0 means no match.
final int suffixLength = findEnding(stemmingZone, theEndingClass);
if (suffixLength == 0)
    return false;
// Truncate the zone so that the matched ending is dropped.
stemmingZone.setLength(stemmingZone.length() - suffixLength);
return true;
| private boolean | findAndRemoveEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass, char[][] thePredessors)Finds the ending among the given class of endings, then checks if this ending was
preceded by any of the given predecessors and, if so, removes it from the stemming zone.
Creation date: (17/03/2002 8:18:34 PM)
// Step 1: the zone must end with one of the endings.
final int suffixLength = findEnding(stemmingZone, theEndingClass);
if (suffixLength == 0)
    return false;
// Step 2: one of the predecessors must end at the character just before
// the matched ending; otherwise the zone is left untouched.
final int lengthWithoutSuffix = stemmingZone.length() - suffixLength;
if (findEnding(stemmingZone, lengthWithoutSuffix - 1, thePredessors) == 0)
    return false;
// Step 3: cut the ending (the predecessor itself stays in the zone).
stemmingZone.setLength(lengthWithoutSuffix);
return true;
| private int | findEnding(java.lang.StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)Finds an ending among the given ending class and returns the length of the ending found (0 if not found).
Creation date: (17/03/2002 8:18:34 PM)
// Candidates are tried from the last table entry down to the first, and the
// first one that matches wins. Matching compares the candidate backwards
// against the zone, starting at startIndex.
for (int candidate = theEndingClass.length - 1; candidate >= 0; candidate--)
{
    final char[] ending = theEndingClass[candidate];
    // an ending that would reach before index 0 cannot match
    if (ending.length - 1 > startIndex)
        continue;
    boolean matched = true;
    int zonePos = startIndex;
    // ending entries are indices into charset, which yields the real letters
    for (int k = ending.length - 1; matched && k >= 0; k--)
    {
        matched = stemmingZone.charAt(zonePos--) == charset[ending[k]];
    }
    if (matched)
        return ending.length; // length of the ending found
}
// nothing matched
return 0;
| private int | findEnding(java.lang.StringBuffer stemmingZone, char[][] theEndingClass)
return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
| private boolean | isVowel(char letter)Checks if character is a vowel.
Creation date: (16/03/2002 10:47:03 PM)
for (int i = 0; i < vowels.length; i++)
{
if (letter == charset[vowels[i]])
return true;
}
return false;
| private void | markPositions(java.lang.String word)Marks positions of RV, R1 and R2 in a given word.
Creation date: (16/03/2002 3:40:11 PM)
// A region that cannot be found keeps the value 0 set here.
RV = 0;
R1 = 0;
R2 = 0;
final int length = word.length();
int pos = 0;

// RV: starts right after the first vowel of the word
while (pos < length && !isVowel(word.charAt(pos)))
{
    pos++;
}
pos++;
if (pos >= length)
    return; // RV zone is empty
RV = pos;

// R1: starts right after the first non-vowel that follows that vowel run
while (pos < length && isVowel(word.charAt(pos)))
{
    pos++;
}
pos++;
if (pos >= length)
    return; // R1 zone is empty
R1 = pos;

// R2: repeat the search from R1 - skip non-vowels, step past one
// character, skip vowels, step past one character
while (pos < length && !isVowel(word.charAt(pos)))
{
    pos++;
}
pos++;
if (pos >= length)
    return; // R2 zone is empty
while (pos < length && isVowel(word.charAt(pos)))
{
    pos++;
}
pos++;
if (pos >= length)
    return; // R2 zone is empty
R2 = pos;
| private boolean | noun(java.lang.StringBuffer stemmingZone)Noun endings.
Creation date: (17/03/2002 12:14:58 AM)
return findAndRemoveEnding(stemmingZone, nounEndings);
| private boolean | perfectiveGerund(java.lang.StringBuffer stemmingZone)Perfective gerund endings.
Creation date: (17/03/2002 12:14:58 AM)
// Group 1 endings are removed only when one of their predecessors comes
// directly before them.
if (findAndRemoveEnding(stemmingZone, perfectiveGerundEndings1, perfectiveGerund1Predessors))
    return true;
// Group 2 endings are removed unconditionally.
return findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
| private boolean | reflexive(java.lang.StringBuffer stemmingZone)Reflexive endings.
Creation date: (17/03/2002 12:14:58 AM)
return findAndRemoveEnding(stemmingZone, reflexiveEndings);
| private boolean | removeI(java.lang.StringBuffer stemmingZone)Removes a trailing I letter from the stemming zone, if present.
Creation date: (17/03/2002 12:14:58 AM)
if (stemmingZone.length() > 0
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
{
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
}
else
{
return false;
}
| private boolean | removeSoft(java.lang.StringBuffer stemmingZone)Removes a trailing SOFT letter (soft sign) from the stemming zone, if present.
Creation date: (17/03/2002 12:14:58 AM)
if (stemmingZone.length() > 0
&& stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
{
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
}
else
{
return false;
}
| public void | setCharset(char[] newCharset)Sets the charset table used to map the letter constants to actual characters.
Creation date: (16/03/2002 10:58:42 PM)
// Keep a reference to the supplied table (the array is not copied).
this.charset = newCharset;
| private void | setEndings()Set ending definition as in Russian stemming algorithm.
Creation date: (16/03/2002 11:16:36 PM)
// Populates the vowel list and every ending table used by the stemming steps.
// All entries are the letter constants (A, B, V, ...), which are used elsewhere
// in this class as indices into charset to obtain the actual characters.
//
// Ordering matters: findEnding() scans a table from its last entry back to its
// first and returns on the first match, so within each table the longer
// endings are listed after the shorter ones they contain.

// letters that isVowel() treats as vowels
vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };
// perfective gerund endings, group 1: removed only after one of
// perfectiveGerund1Predessors
perfectiveGerundEndings1 = new char[][] {
{ V }, { V, SH, I }, { V, SH, I, S, SOFT }
};
perfectiveGerund1Predessors = new char[][] { { A }, { IA }
};
// perfective gerund endings, group 2: removed unconditionally
perfectiveGerundEndings2 = new char[][] {
{ I, V },
{ Y, V },
{ I, V, SH, I },
{ Y, V, SH, I },
{ I, V, SH, I, S, SOFT },
{ Y, V, SH, I, S, SOFT }
};
// adjective endings, used by adjectival()
adjectiveEndings = new char[][] {
{ E, E },
{ I, E },
{ Y, E },
{ O, E },
{ E, I_ },
{ I, I_ },
{ Y, I_ },
{ O, I_ },
{ E, M },
{ I, M },
{ Y, M },
{ O, M },
{ I, X },
{ Y, X },
{ U, IU },
{ IU, IU },
{ A, IA },
{ IA, IA },
{ O, IU },
{ E, IU },
{ I, M, I },
{ Y, M, I },
{ E, G, O },
{ O, G, O },
{ E, M, U },
{ O, M, U }
};
// participle endings, group 1: removed only after one of participle1Predessors
participleEndings1 = new char[][] {
{ SHCH },
{ E, M },
{ N, N },
{ V, SH },
{ IU, SHCH }
};
// participle endings, group 2: removed unconditionally
participleEndings2 = new char[][] {
{ I, V, SH },
{ Y, V, SH },
{ U, IU, SHCH }
};
participle1Predessors = new char[][] {
{ A },
{ IA }
};
// reflexive endings, used by reflexive()
reflexiveEndings = new char[][] {
{ S, IA },
{ S, SOFT }
};
// verb endings, group 1: removed only after one of verb1Predessors
verbEndings1 = new char[][] {
{ I_ },
{ L },
{ N },
{ L, O },
{ N, O },
{ E, T },
{ IU, T },
{ L, A },
{ N, A },
{ L, I },
{ E, M },
{ N, Y },
{ E, T, E },
{ I_, T, E },
{ T, SOFT },
{ E, SH, SOFT },
{ N, N, O }
};
// verb endings, group 2: removed unconditionally
verbEndings2 = new char[][] {
{ IU },
{ U, IU },
{ E, N },
{ E, I_ },
{ IA, T },
{ U, I_ },
{ I, L },
{ Y, L },
{ I, M },
{ Y, M },
{ I, T },
{ Y, T },
{ I, L, A },
{ Y, L, A },
{ E, N, A },
{ I, T, E },
{ I, L, I },
{ Y, L, I },
{ I, L, O },
{ Y, L, O },
{ E, N, O },
{ U, E, T },
{ U, IU, T },
{ E, N, Y },
{ I, T, SOFT },
{ Y, T, SOFT },
{ I, SH, SOFT },
{ E, I_, T, E },
{ U, I_, T, E }
};
verb1Predessors = new char[][] {
{ A },
{ IA }
};
// noun endings, used by noun()
nounEndings = new char[][] {
{ A },
{ IU },
{ I_ },
{ O },
{ U },
{ E },
{ Y },
{ I },
{ SOFT },
{ IA },
{ E, V },
{ O, V },
{ I, E },
{ SOFT, E },
{ IA, X },
{ I, IU },
{ E, I },
{ I, I },
{ E, I_ },
{ O, I_ },
{ E, M },
{ A, M },
{ O, M },
{ A, X },
{ SOFT, IU },
{ I, IA },
{ SOFT, IA },
{ I, I_ },
{ IA, M },
{ IA, M, I },
{ A, M, I },
{ I, E, I_ },
{ I, IA, M },
{ I, E, M },
{ I, IA, X },
{ I, IA, M, I }
};
// superlative endings, used by superlative()
superlativeEndings = new char[][] {
{ E, I_, SH },
{ E, I_, SH, E }
};
// derivational endings, used by derivational()
derivationalEndings = new char[][] {
{ O, S, T },
{ O, S, T, SOFT }
};
| public java.lang.String | stem(java.lang.String input)Finds the stem for given Russian word.
Creation date: (16/03/2002 3:36:48 PM)
// Locate RV, R1 and R2 for this word.
markPositions(input);
if (RV == 0)
    return input; // no RV region was detected, so there is nothing to stem

// Every step below works on the RV part of the word only.
StringBuffer zone = new StringBuffer(input.substring(RV));

// Step 1: a perfective gerund ending, or otherwise a reflexive ending
// followed by the first of adjectival / verb / noun that applies.
if (!perfectiveGerund(zone))
{
    reflexive(zone);
    if (!adjectival(zone) && !verb(zone))
    {
        noun(zone);
    }
}

// Step 2: trailing I
removeI(zone);

// Step 3: derivational ending
derivational(zone);

// Step 4: superlative ending, doubled N, trailing SOFT - each attempted in turn
superlative(zone);
undoubleN(zone);
removeSoft(zone);

// Re-attach the untouched prefix that precedes RV.
return input.substring(0, RV).concat(zone.toString());
| public static java.lang.String | stem(java.lang.String theWord, char[] charset)Static method for stemming with different charsets
RussianStemmer stemmer = new RussianStemmer();
stemmer.setCharset(charset);
return stemmer.stem(theWord);
| private boolean | superlative(java.lang.StringBuffer stemmingZone)Superlative endings.
Creation date: (17/03/2002 12:14:58 AM)
return findAndRemoveEnding(stemmingZone, superlativeEndings);
| private boolean | undoubleN(java.lang.StringBuffer stemmingZone)Undoubles N.
Creation date: (17/03/2002 12:14:58 AM)
// A zone ending in a doubled N loses exactly one of the two letters.
final char[][] doubledN = new char[][] { { N, N } };
if (findEnding(stemmingZone, doubledN) == 0)
    return false; // the zone does not end with N N
stemmingZone.setLength(stemmingZone.length() - 1);
return true;
| private boolean | verb(java.lang.StringBuffer stemmingZone)Verb endings.
Creation date: (17/03/2002 12:14:58 AM)
// Group 1 endings are removed only when one of their predecessors comes
// directly before them.
if (findAndRemoveEnding(stemmingZone, verbEndings1, verb1Predessors))
    return true;
// Group 2 endings are removed unconditionally.
return findAndRemoveEnding(stemmingZone, verbEndings2);
|
|