File Doc Category Size Date Package
FrenchStemmer.java API Doc Apache Lucene 1.9 19758 Mon Feb 20 09:18:52 GMT 2006 org.apache.lucene.analysis.fr

FrenchStemmer

java.lang.Object

public class FrenchStemmer extends Object

A stemmer for French words. The algorithm is based on the work of Dr Martin Porter on his Snowball project.
Refer to http://snowball.sourceforge.net/french/stemmer.html
(French stemming algorithm) for details.

author: Patrick Talbot

Fields Summary
private StringBuffer
sb
Buffer for the terms while stemming them.
private StringBuffer
tb
A temporary buffer, used to reconstruct R2
private String
R0
Region R0 is equal to the whole buffer
private String
RV
Region RV "If the word begins with two vowels, RV is the region after the third letter, otherwise the region after the first vowel not at the beginning of the word, or the end of the word if these positions cannot be found."
private String
R1
Region R1 "R1 is the region after the first non-vowel following a vowel or is the null region at the end of the word if there is no such non-vowel"
private String
R2
Region R2 "R2 is the region after the first non-vowel in R1 following a vowel or is the null region at the end of the word if there is no such non-vowel"
private boolean
suite
Set to true if we need to perform step 2
private boolean
modified
Set to true if the buffer was modified
Constructors Summary
Methods Summary
private void deleteButSuffixFrom(java.lang.String source, java.lang.String[] search, java.lang.String prefix, boolean without)
Delete a suffix searched in zone "source" if preceded by the prefix
param
source java.lang.String - the primary source zone for search
param
search java.lang.String[] - the strings to search for suppression
param
prefix java.lang.String - the prefix to add to the search string to test
param
without boolean - true if it will be deleted even without prefix found
if (source!=null) { for (int i = 0; i < search.length; i++) { if ( source.endsWith( prefix + search[i] )) { sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); modified = true; setStrings(); break; } else if ( without && source.endsWith( search[i] )) { sb.delete( sb.length() - search[i].length(), sb.length() ); modified = true; setStrings(); break; } } }
private void deleteButSuffixFromElseReplace(java.lang.String source, java.lang.String[] search, java.lang.String prefix, boolean without, java.lang.String from, java.lang.String replace)
Delete a suffix searched in zone "source" if preceded by prefix
or replace it with the replace string if preceded by the prefix in the zone "from"
or delete the suffix if specified
param
source java.lang.String - the primary source zone for search
param
search java.lang.String[] - the strings to search for suppression
param
prefix java.lang.String - the prefix to add to the search string to test
param
without boolean - true if it will be deleted even without prefix found
if (source!=null) { for (int i = 0; i < search.length; i++) { if ( source.endsWith( prefix + search[i] )) { sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); modified = true; setStrings(); break; } else if ( from!=null && from.endsWith( prefix + search[i] )) { sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace ); modified = true; setStrings(); break; } else if ( without && source.endsWith( search[i] )) { sb.delete( sb.length() - search[i].length(), sb.length() ); modified = true; setStrings(); break; } } }
private void deleteFrom(java.lang.String source, java.lang.String[] suffix)
Delete a search string within the source zone
param
source the source zone for search
param
suffix the strings to search for suppression
if (source!=null) { for (int i = 0; i < suffix.length; i++) { if (source.endsWith( suffix[i] )) { sb.delete( sb.length() - suffix[i].length(), sb.length()); modified = true; setStrings(); break; } } }
private boolean deleteFromIfPrecededIn(java.lang.String source, java.lang.String[] search, java.lang.String from, java.lang.String prefix)
Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
param
source java.lang.String - the primary source zone for search
param
search java.lang.String[] - the strings to search for suppression
param
from java.lang.String - the secondary source zone for search
param
prefix java.lang.String - the prefix to add to the search string to test
return
boolean - true if modified
boolean found = false; if (source!=null ) { for (int i = 0; i < search.length; i++) { if ( source.endsWith( search[i] )) { if (from!=null && from.endsWith( prefix + search[i] )) { sb.delete( sb.length() - search[i].length(), sb.length()); found = true; setStrings(); break; } } } } return found;
private boolean deleteFromIfTestVowelBeforeIn(java.lang.String source, java.lang.String[] search, boolean vowel, java.lang.String from)
Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
param
source java.lang.String - the primary source zone for search
param
search java.lang.String[] - the strings to search for suppression
param
vowel boolean - true if we need a vowel before the search string
param
from java.lang.String - the secondary source zone for search (where vowel could be)
return
boolean - true if modified
boolean found = false; if (source!=null && from!=null) { for (int i = 0; i < search.length; i++) { if ( source.endsWith( search[i] )) { if ((search[i].length() + 1) <= from.length()) { boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1))); if (test == vowel) { sb.delete( sb.length() - search[i].length(), sb.length()); modified = true; found = true; setStrings(); break; } } } } } return found;
private boolean isStemmable(java.lang.String term)
Checks a term if it can be processed correctly.
return
boolean - true if, and only if, the given term consists in letters.
boolean upper = false; int first = -1; for ( int c = 0; c < term.length(); c++ ) { // Discard terms that contain non-letter characters. if ( !Character.isLetter( term.charAt( c ) ) ) { return false; } // Discard terms that contain multiple uppercase letters. if ( Character.isUpperCase( term.charAt( c ) ) ) { if ( upper ) { return false; } // First encountered uppercase letter, set flag and save // position. else { first = c; upper = true; } } } // Discard the term if it contains a single uppercase letter that // is not starting the term. if ( first > 0 ) { return false; } return true;
private boolean isVowel(char ch)
Test if a char is a french vowel, including accentuated ones
param
ch the char to test
return
boolean - true if the char is a vowel
switch (ch) { case 'a": case 'e": case 'i": case 'o": case 'u": case 'y": case 'â": case 'à": case 'ë": case 'é": case 'ê": case 'è": case 'ï": case 'î": case 'ô": case 'ü": case 'ù": case 'û": return true; default: return false; }
private boolean replaceFrom(java.lang.String source, java.lang.String[] search, java.lang.String replace)
Replace a search string with another within the source zone
param
source java.lang.String - the source zone for search
param
search java.lang.String[] - the strings to search for replacement
param
replace java.lang.String - the replacement string
boolean found = false; if (source!=null) { for (int i = 0; i < search.length; i++) { if ( source.endsWith( search[i] )) { sb.replace( sb.length() - search[i].length(), sb.length(), replace ); modified = true; found = true; setStrings(); break; } } } return found;
private java.lang.String retrieveR(java.lang.StringBuffer buffer)
Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string
"R is the region after the first non-vowel following a vowel or is the null region at the end of the word if there is no such non-vowel"
param
buffer java.lang.StringBuffer - the in buffer
return
java.lang.String - the resulting string
int len = buffer.length(); int pos = -1; for (int c = 0; c < len; c++) { if (isVowel( buffer.charAt( c ))) { pos = c; break; } } if (pos > -1) { int consonne = -1; for (int c = pos; c < len; c++) { if (!isVowel(buffer.charAt( c ))) { consonne = c; break; } } if (consonne > -1 && (consonne+1) < len) return buffer.substring( consonne+1, len ); else return null; } else return null;
private java.lang.String retrieveRV(java.lang.StringBuffer buffer)
Retrieve the "RV zone" from a buffer an return the corresponding string
"If the word begins with two vowels, RV is the region after the third letter, otherwise the region after the first vowel not at the beginning of the word, or the end of the word if these positions cannot be found."
param
buffer java.lang.StringBuffer - the in buffer
return
java.lang.String - the resulting string
int len = buffer.length(); if ( buffer.length() > 3) { if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) { return buffer.substring(3,len); } else { int pos = 0; for (int c = 1; c < len; c++) { if (isVowel( buffer.charAt( c ))) { pos = c; break; } } if ( pos+1 < len ) return buffer.substring( pos+1, len ); else return null; } } else return null;
private void setStrings()
Sets the search region Strings
it needs to be done each time the buffer was modified
// set the strings R0 = sb.toString(); RV = retrieveRV( sb ); R1 = retrieveR( sb ); if ( R1 != null ) { tb.delete( 0, tb.length() ); tb.insert( 0, R1 ); R2 = retrieveR( tb ); } else R2 = null;
protected java.lang.String stem(java.lang.String term)
Stemms the given term to a unique discriminator.
param
term java.langString The term that should be stemmed
return
java.lang.String Discriminator for term
if ( !isStemmable( term ) ) { return term; } // Use lowercase for medium stemming. term = term.toLowerCase(); // Reset the StringBuffer. sb.delete( 0, sb.length() ); sb.insert( 0, term ); // reset the booleans modified = false; suite = false; sb = treatVowels( sb ); setStrings(); step1(); if (!modified || suite) { if (RV != null) { suite = step2a(); if (!suite) step2b(); } } if (modified || suite) step3(); else step4(); step5(); step6(); return sb.toString();
private void step1()
First step of the Porter Algorithmn
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" }; deleteFrom( R2, suffix ); replaceFrom( R2, new String[] { "logies", "logie" }, "log" ); replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" ); replaceFrom( R2, new String[] { "ences", "ence" }, "ent" ); String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"}; deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" ); deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" ); deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false ); deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false ); deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false ); deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false ); deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 ); deleteFrom( RV, new String[] { "ements", "ement" } ); deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" ); deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" ); deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true ); String[] autre = { "ifs", "ives", "if", "ive" }; deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" ); deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" ); replaceFrom( R0, new String[] { "eaux" }, "eau" ); replaceFrom( R1, new String[] { "aux" }, "al" ); deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" ); deleteFrom( R2, new String[] { "eux" } ); // if one of the next steps is performed, we will need to perform step2a boolean temp = false; temp = replaceFrom( RV, new String[] { "amment" }, "ant" ); if (temp == true) suite = true; temp = replaceFrom( RV, new String[] { "emment" }, "ent" ); if (temp == true) suite = true; 
temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV ); if (temp == true) suite = true;
private boolean step2a()
Second step (A) of the Porter Algorithmn
Will be performed if nothing changed from the first step or changed were done in the amment, emment, ments or ment suffixes
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
return
boolean - true if something changed in the StringBuffer
String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira", "irent", "iriez", "irez", "irions", "irons", "iront", "issaIent", "issais", "issantes", "issante", "issants", "issant", "issait", "issais", "issions", "issons", "issiez", "issez", "issent", "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" }; return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
private void step2b()
Second step (B) of the Porter Algorithmn
Will be performed if step 2 A was performed unsuccessfully
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", "erons", "eront","erez", "èrent", "era", "ées", "iez", "ée", "és", "er", "ez", "é" }; deleteFrom( RV, suffix ); String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent", "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant", "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" }; deleteButSuffixFrom( RV, search, "e", true ); deleteFrom( R2, new String[] { "ions" } );
private void step3()
Third step of the Porter Algorithmn
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
if (sb.length()>0) { char ch = sb.charAt( sb.length()-1 ); if (ch == 'Y") { sb.setCharAt( sb.length()-1, 'i" ); setStrings(); } else if (ch == 'ç") { sb.setCharAt( sb.length()-1, 'c" ); setStrings(); } }
private void step4()
Fourth step of the Porter Algorithmn
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
if (sb.length() > 1) { char ch = sb.charAt( sb.length()-1 ); if (ch == 's") { char b = sb.charAt( sb.length()-2 ); if (b != 'a" && b != 'i" && b != 'o" && b != 'u" && b != 'è" && b != 's") { sb.delete( sb.length() - 1, sb.length()); setStrings(); } } } boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" ); if (!found) found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" ); replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" ); deleteFrom( RV, new String[] { "e" } ); deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
private void step5()
Fifth step of the Porter Algorithmn
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
if (R0 != null) { if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill")) { sb.delete( sb.length() - 1, sb.length() ); setStrings(); } }
private void step6()
Sixth (and last!) step of the Porter Algorithmn
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
if (R0!=null && R0.length()>0) { boolean seenVowel = false; boolean seenConson = false; int pos = -1; for (int i = R0.length()-1; i > -1; i--) { char ch = R0.charAt(i); if (isVowel(ch)) { if (!seenVowel) { if (ch == 'é" || ch == 'è") { pos = i; break; } } seenVowel = true; } else { if (seenVowel) break; else seenConson = true; } } if (pos > -1 && seenConson && !seenVowel) sb.setCharAt(pos, 'e"); }
private java.lang.StringBuffer treatVowels(java.lang.StringBuffer buffer)
Turns u and i preceded AND followed by a vowel to UpperCase
Turns y preceded OR followed by a vowel to UpperCase
Turns u preceded by q to UpperCase
param
buffer java.util.StringBuffer - the buffer to treat
return
java.util.StringBuffer - the treated buffer
for ( int c = 0; c < buffer.length(); c++ ) { char ch = buffer.charAt( c ); if (c == 0) // first char { if (buffer.length()>1) { if (ch == 'y" && isVowel(buffer.charAt( c + 1 ))) buffer.setCharAt( c, 'Y" ); } } else if (c == buffer.length()-1) // last char { if (ch == 'u" && buffer.charAt( c - 1 ) == 'q") buffer.setCharAt( c, 'U" ); if (ch == 'y" && isVowel(buffer.charAt( c - 1 ))) buffer.setCharAt( c, 'Y" ); } else // other cases { if (ch == 'u") { if (buffer.charAt( c - 1) == 'q") buffer.setCharAt( c, 'U" ); else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) buffer.setCharAt( c, 'U" ); } if (ch == 'i") { if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) buffer.setCharAt( c, 'I" ); } if (ch == 'y") { if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 ))) buffer.setCharAt( c, 'Y" ); } } } return buffer;