FrenchStemmer: public class FrenchStemmer extends Object. A stemmer for French words. The algorithm is based on the work of
Dr Martin Porter on his Snowball project;
refer to http://snowball.sourceforge.net/french/stemmer.html
(French stemming algorithm) for details. |
Fields Summary |
---|
private StringBuffer | sb: Buffer for the terms while stemming them. | private StringBuffer | tb: A temporary buffer, used to reconstruct R2 | private String | R0: Region R0 is equal to the whole buffer | private String | RV: Region RV
"If the word begins with two vowels, RV is the region after the third letter,
otherwise the region after the first vowel not at the beginning of the word,
or the end of the word if these positions cannot be found." | private String | R1: Region R1
"R1 is the region after the first non-vowel following a vowel
or is the null region at the end of the word if there is no such non-vowel" | private String | R2: Region R2
"R2 is the region after the first non-vowel in R1 following a vowel
or is the null region at the end of the word if there is no such non-vowel" | private boolean | suite: Set to true if we need to perform step 2 | private boolean | modified: Set to true if the buffer was modified |
Methods Summary |
---|
private void | deleteButSuffixFrom(java.lang.String source, java.lang.String[] search, java.lang.String prefix, boolean without)Delete a suffix searched in zone "source" if preceded by the prefix
if (source!=null)
{
for (int i = 0; i < search.length; i++) {
if ( source.endsWith( prefix + search[i] ))
{
sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
modified = true;
setStrings();
break;
}
else if ( without && source.endsWith( search[i] ))
{
sb.delete( sb.length() - search[i].length(), sb.length() );
modified = true;
setStrings();
break;
}
}
}
| private void | deleteButSuffixFromElseReplace(java.lang.String source, java.lang.String[] search, java.lang.String prefix, boolean without, java.lang.String from, java.lang.String replace)Delete a suffix searched in zone "source" if preceded by prefix
or replace it with the replace string if preceded by the prefix in the zone "from"
or delete the suffix if specified
if (source!=null)
{
for (int i = 0; i < search.length; i++) {
if ( source.endsWith( prefix + search[i] ))
{
sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
modified = true;
setStrings();
break;
}
else if ( from!=null && from.endsWith( prefix + search[i] ))
{
sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
modified = true;
setStrings();
break;
}
else if ( without && source.endsWith( search[i] ))
{
sb.delete( sb.length() - search[i].length(), sb.length() );
modified = true;
setStrings();
break;
}
}
}
| private void | deleteFrom(java.lang.String source, java.lang.String[] suffix)Delete a search string within the source zone
if (source!=null)
{
for (int i = 0; i < suffix.length; i++) {
if (source.endsWith( suffix[i] ))
{
sb.delete( sb.length() - suffix[i].length(), sb.length());
modified = true;
setStrings();
break;
}
}
}
| private boolean | deleteFromIfPrecededIn(java.lang.String source, java.lang.String[] search, java.lang.String from, java.lang.String prefix)Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
// Removes the first listed suffix that closes the "source" zone, provided the
// "from" zone ends with prefix + suffix. Only the suffix is removed; the
// prefix stays in the buffer. Returns true when a deletion happened.
// Note: this helper recomputes the regions but leaves the "modified" flag
// untouched.
if (source == null) {
    return false;
}
for (int idx = 0; idx < search.length; idx++) {
    final String ending = search[idx];
    final boolean closesSource = source.endsWith(ending);
    if (closesSource && from != null && from.endsWith(prefix + ending)) {
        sb.delete(sb.length() - ending.length(), sb.length());
        setStrings();
        return true;
    }
}
return false;
| private boolean | deleteFromIfTestVowelBeforeIn(java.lang.String source, java.lang.String[] search, boolean vowel, java.lang.String from)Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
// Removes the first listed suffix that closes the "source" zone when the
// character just before it is a vowel (vowel == true) or is not a vowel
// (vowel == false). The preceding character must lie inside the "from" zone,
// i.e. "from" has to be at least one character longer than the suffix.
// A candidate that fails the test is skipped and the next one is tried.
// Returns true when a deletion happened.
if (source == null || from == null) {
    return false;
}
for (int idx = 0; idx < search.length; idx++) {
    final String ending = search[idx];
    if (!source.endsWith(ending)) {
        continue;
    }
    final int span = ending.length() + 1;
    if (span > from.length()) {
        // no room in "from" for a character before the suffix
        continue;
    }
    final boolean precededByVowel = isVowel(sb.charAt(sb.length() - span));
    if (precededByVowel != vowel) {
        continue;
    }
    sb.delete(sb.length() - ending.length(), sb.length());
    modified = true;
    // the buffer changed, so the regions must be recomputed
    setStrings();
    return true;
}
return false;
| private boolean | isStemmable(java.lang.String term)Checks a term if it can be processed correctly.
// A term is stemmable when it consists of letters only and contains at most
// one uppercase letter, which must then be the very first character.
int upperAt = -1; // index of the single uppercase letter seen so far, -1 if none
for (int idx = 0; idx < term.length(); idx++) {
    final char current = term.charAt(idx);
    if (!Character.isLetter(current)) {
        // any non-letter character disqualifies the term
        return false;
    }
    if (Character.isUpperCase(current)) {
        if (upperAt >= 0) {
            // a second uppercase letter disqualifies the term
            return false;
        }
        upperAt = idx;
    }
}
// accepted when there is no uppercase letter (-1) or it starts the term (0)
return upperAt <= 0;
| private boolean | isVowel(char ch)Test if a char is a french vowel, including accentuated ones
// Returns true when ch is one of the lowercase French vowels listed below,
// plain or accented. Every other character falls through to the default
// branch and yields false; that includes the uppercase letters, so the
// 'U', 'I' and 'Y' markers are treated as non-vowels.
switch (ch) {
    case 'a':
    case 'e':
    case 'i':
    case 'o':
    case 'u':
    case 'y':
    case 'â':
    case 'à':
    case 'ë':
    case 'é':
    case 'ê':
    case 'è':
    case 'ï':
    case 'î':
    case 'ô':
    case 'ü':
    case 'ù':
    case 'û':
        return true;
    default:
        return false;
}
| private boolean | replaceFrom(java.lang.String source, java.lang.String[] search, java.lang.String replace)Replace a search string with another within the source zone
// Replaces, at the end of the buffer, the first listed suffix that closes the
// "source" zone with the "replace" string. Returns true when a replacement
// happened, false when the zone is null or no suffix matched.
if (source == null) {
    return false;
}
for (int idx = 0; idx < search.length; idx++) {
    final String ending = search[idx];
    if (!source.endsWith(ending)) {
        continue;
    }
    final int end = sb.length();
    sb.replace(end - ending.length(), end, replace);
    modified = true;
    // the buffer changed, so the regions must be recomputed
    setStrings();
    return true;
}
return false;
| private java.lang.String | retrieveR(java.lang.StringBuffer buffer)Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string
"R is the region after the first non-vowel following a vowel
or is the null region at the end of the word if there is no such non-vowel"
// Computes the region that starts right after the first non-vowel that
// follows the first vowel of the buffer. Returns null when there is no
// vowel, no non-vowel after it, or nothing left after that non-vowel.
final int length = buffer.length();
int cursor = 0;

// advance to the first vowel
while (cursor < length && !isVowel(buffer.charAt(cursor))) {
    cursor++;
}
if (cursor == length) {
    return null;
}

// advance past the run of vowels to the first non-vowel
while (cursor < length && isVowel(buffer.charAt(cursor))) {
    cursor++;
}

// cursor now sits on that non-vowel, or on length when none was found;
// either way the region is non-empty only if something follows cursor
if (cursor + 1 < length) {
    return buffer.substring(cursor + 1, length);
}
return null;
| private java.lang.String | retrieveRV(java.lang.StringBuffer buffer) Retrieve the "RV zone" from a buffer and return the corresponding string
"If the word begins with two vowels, RV is the region after the third letter,
otherwise the region after the first vowel not at the beginning of the word,
or the end of the word if these positions cannot be found."
// Computes the RV region of the buffer, or null when the region is empty.
// Buffers of three characters or fewer never get an RV region here.
// If the buffer starts with two vowels, RV is everything after the third
// character. Otherwise RV is everything after the first vowel found at
// index 1 or later.
final int len = buffer.length();
if (len <= 3) {
    return null;
}
if (isVowel(buffer.charAt(0)) && isVowel(buffer.charAt(1))) {
    return buffer.substring(3, len);
}

// -1 means "no vowel found after the first character". A 0 sentinel would be
// indistinguishable from a real position and would wrongly yield
// substring(1) for words without such a vowel, whereas the definition says
// RV is then the end of the word.
int pos = -1;
for (int c = 1; c < len; c++) {
    if (isVowel(buffer.charAt(c))) {
        pos = c;
        break;
    }
}
if (pos == -1 || pos + 1 >= len) {
    return null;
}
return buffer.substring(pos + 1, len);
| private void | setStrings()Sets the search region Strings
it needs to be done each time the buffer was modified
// Refresh every search region from the current content of the buffer.
R0 = sb.toString();
RV = retrieveRV(sb);
R1 = retrieveR(sb);
if (R1 == null) {
    // without an R1 there cannot be an R2
    R2 = null;
    return;
}
// R2 is obtained by applying the R computation to R1 itself, using the
// temporary buffer as a scratch copy of R1
tb.setLength(0);
tb.append(R1);
R2 = retrieveR(tb);
| protected java.lang.String | stem(java.lang.String term) Stems the given term to a unique discriminator.
// Terms that fail the isStemmable check are returned untouched.
if (!isStemmable(term)) {
    return term;
}

// Lowercase with an explicit locale: the no-argument toLowerCase() uses the
// JVM default locale, whose casing rules may differ (for instance the
// Turkish dotless i), which would make the result machine-dependent.
term = term.toLowerCase(java.util.Locale.FRENCH);

// Reset the working buffer to the lowercased term.
sb.delete(0, sb.length());
sb.insert(0, term);

// Reset the per-term state flags.
modified = false;
suite = false;

// Mark the special u/i/y occurrences, then compute the regions.
sb = treatVowels(sb);
setStrings();

step1();

// Step 2 runs when step 1 changed nothing, or when step 1 asked for it
// through the "suite" flag; it needs a non-null RV region.
if (!modified || suite) {
    if (RV != null) {
        suite = step2a();
        if (!suite) {
            step2b();
        }
    }
}

// Step 3 follows any modification so far, otherwise step 4 is applied.
if (modified || suite) {
    step3();
} else {
    step4();
}

step5();
step6();

return sb.toString();
| private void | step1() First step of the Porter algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Standard suffix removal. The calls below are order-sensitive: each helper
// may rewrite the buffer and refresh the regions, so the region fields are
// read again at every call rather than cached.
final String[] nounEndings = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
final String[] logie = { "logies", "logie" };
final String[] usion = { "usions", "utions", "usion", "ution" };
final String[] ence = { "ences", "ence" };
final String[] ation = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
final String[] ement = { "ements", "ement" };
final String[] issement = { "issements", "issement" };
final String[] ite = { "ités", "ité" };
final String[] ive = { "ifs", "ives", "if", "ive" };
final String[] euse = { "euses", "euse" };

deleteFrom(R2, nounEndings);
replaceFrom(R2, logie, "log");
replaceFrom(R2, usion, "u");
replaceFrom(R2, ence, "ent");

deleteButSuffixFromElseReplace(R2, ation, "ic", true, R0, "iqU");

// "ement(s)" family: first the eus/eux variant, then the prefixed deletions
deleteButSuffixFromElseReplace(R2, ement, "eus", false, R0, "eux");
final String[] ementPrefixes = { "ativ", "iv", "abl", "iqU" };
for (int p = 0; p < ementPrefixes.length; p++) {
    deleteButSuffixFrom(R2, ement, ementPrefixes[p], false);
}
deleteFromIfTestVowelBeforeIn(R1, issement, false, R0);
deleteFrom(RV, ement);

// "ité(s)" family
deleteButSuffixFromElseReplace(R2, ite, "abil", false, R0, "abl");
deleteButSuffixFromElseReplace(R2, ite, "ic", false, R0, "iqU");
deleteButSuffixFrom(R2, ite, "iv", true);

// "if/ive(s)" family
deleteButSuffixFromElseReplace(R2, ive, "icat", false, R0, "iqU");
deleteButSuffixFromElseReplace(R2, ive, "at", true, R2, "iqU");

replaceFrom(R0, new String[] { "eaux" }, "eau");
replaceFrom(R1, new String[] { "aux" }, "al");

deleteButSuffixFromElseReplace(R2, euse, "", true, R1, "eux");
deleteFrom(R2, new String[] { "eux" });

// If any of the following three rules fires, step 2a must still be run,
// which is signalled through the "suite" flag.
if (replaceFrom(RV, new String[] { "amment" }, "ant")) {
    suite = true;
}
if (replaceFrom(RV, new String[] { "emment" }, "ent")) {
    suite = true;
}
if (deleteFromIfTestVowelBeforeIn(RV, new String[] { "ments", "ment" }, true, RV)) {
    suite = true;
}
| private boolean | step2a() Second step (A) of the Porter algorithm
Will be performed if nothing changed in the first step
or if changes were made to the amment, emment, ments or ment suffixes
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Verb endings starting with "i": the first one that closes RV and is
// preceded, inside RV, by a non-vowel gets deleted. The helper's result
// (true when something was removed) is handed back to the caller.
final String[] iVerbEndings = {
    "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
    "irent", "iriez", "irez", "irions", "irons", "iront",
    "issaIent", "issais", "issantes", "issante", "issants", "issant",
    "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
    "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i"
};
final boolean removed = deleteFromIfTestVowelBeforeIn(RV, iVerbEndings, false, RV);
return removed;
| private void | step2b() Second step (B) of the Porter algorithm
Will be performed if step 2 A was performed unsuccessfully
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Other verb endings, handled in three passes over the current regions.

// Pass 1: endings deleted outright when they close RV.
final String[] erEndings = {
    "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
    "erons", "eront","erez", "èrent", "era", "ées", "iez",
    "ée", "és", "er", "ez", "é"
};
deleteFrom(RV, erEndings);

// Pass 2: endings deleted when they close RV, together with a preceding
// "e" when there is one.
final String[] aEndings = {
    "assions", "assiez", "assent", "asses", "asse", "aIent",
    "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
    "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a"
};
deleteButSuffixFrom(RV, aEndings, "e", true);

// Pass 3: "ions" is deleted when it closes R2.
final String[] ions = { "ions" };
deleteFrom(R2, ions);
| private void | step3() Third step of the Porter algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
if (sb.length()>0)
{
char ch = sb.charAt( sb.length()-1 );
if (ch == 'Y")
{
sb.setCharAt( sb.length()-1, 'i" );
setStrings();
}
else if (ch == 'ç")
{
sb.setCharAt( sb.length()-1, 'c" );
setStrings();
}
}
| private void | step4() Fourth step of the Porter algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
if (sb.length() > 1)
{
char ch = sb.charAt( sb.length()-1 );
if (ch == 's")
{
char b = sb.charAt( sb.length()-2 );
if (b != 'a" && b != 'i" && b != 'o" && b != 'u" && b != 'è" && b != 's")
{
sb.delete( sb.length() - 1, sb.length());
setStrings();
}
}
}
boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
if (!found)
found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
deleteFrom( RV, new String[] { "e" } );
deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
| private void | step5() Fifth step of the Porter algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
if (R0 != null)
{
if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
{
sb.delete( sb.length() - 1, sb.length() );
setStrings();
}
}
| private void | step6() Sixth (and last!) step of the Porter algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
// Un-accenting: when the word ends with 'é' or 'è' followed by at least one
// non-vowel, that accented letter is replaced by a plain 'e'.
// Only the last vowel of the word is considered; if it is any other vowel,
// or if nothing follows it, the buffer is left unchanged.
if (R0 != null && R0.length() > 0) {
    final int lastIndex = R0.length() - 1;

    // walk backwards over the trailing run of non-vowels
    int pos = lastIndex;
    while (pos >= 0 && !isVowel(R0.charAt(pos))) {
        pos--;
    }

    // at least one non-vowel must have been skipped, and a vowel must exist
    final boolean seenConson = pos < lastIndex;
    if (pos >= 0 && seenConson) {
        final char ch = R0.charAt(pos);
        if (ch == 'é' || ch == 'è') {
            sb.setCharAt(pos, 'e');
        }
    }
}
| private java.lang.StringBuffer | treatVowels(java.lang.StringBuffer buffer)Turns u and i preceded AND followed by a vowel to UpperCase
Turns y preceded OR followed by a vowel to UpperCase
Turns u preceded by q to UpperCase
for ( int c = 0; c < buffer.length(); c++ ) {
char ch = buffer.charAt( c );
if (c == 0) // first char
{
if (buffer.length()>1)
{
if (ch == 'y" && isVowel(buffer.charAt( c + 1 )))
buffer.setCharAt( c, 'Y" );
}
}
else if (c == buffer.length()-1) // last char
{
if (ch == 'u" && buffer.charAt( c - 1 ) == 'q")
buffer.setCharAt( c, 'U" );
if (ch == 'y" && isVowel(buffer.charAt( c - 1 )))
buffer.setCharAt( c, 'Y" );
}
else // other cases
{
if (ch == 'u")
{
if (buffer.charAt( c - 1) == 'q")
buffer.setCharAt( c, 'U" );
else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
buffer.setCharAt( c, 'U" );
}
if (ch == 'i")
{
if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
buffer.setCharAt( c, 'I" );
}
if (ch == 'y")
{
if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
buffer.setCharAt( c, 'Y" );
}
}
}
return buffer;
|
|