File | Doc | Category | Size | Date | Package
FrenchStemmer.java | API Doc | Apache Lucene 2.1.0 | 19941 | Wed Feb 14 10:46:28 GMT 2007 | org.apache.lucene.analysis.fr

FrenchStemmer

public class FrenchStemmer extends Object
A stemmer for French words. The algorithm is based on the work of Dr Martin Porter on his Snowball project.
Refer to http://snowball.sourceforge.net/french/stemmer.html
(French stemming algorithm) for details.
author
Patrick Talbot

Fields Summary
private StringBuffer
sb
Buffer for the terms while stemming them.
private StringBuffer
tb
A temporary buffer, used to reconstruct R2
private String
R0
Region R0 is equal to the whole buffer
private String
RV
Region RV "If the word begins with two vowels, RV is the region after the third letter, otherwise the region after the first vowel not at the beginning of the word, or the end of the word if these positions cannot be found."
private String
R1
Region R1 "R1 is the region after the first non-vowel following a vowel or is the null region at the end of the word if there is no such non-vowel"
private String
R2
Region R2 "R2 is the region after the first non-vowel in R1 following a vowel or is the null region at the end of the word if there is no such non-vowel"
private boolean
suite
Set to true if we need to perform step 2
private boolean
modified
Set to true if the buffer was modified
Constructors Summary
Methods Summary
private void deleteButSuffixFrom(java.lang.String source, java.lang.String[] search, java.lang.String prefix, boolean without)
Delete a suffix searched in zone "source" if preceded by the prefix

param
source java.lang.String - the primary source zone for search
param
search java.lang.String[] - the strings to search for suppression
param
prefix java.lang.String - the prefix to add to the search string to test
param
without boolean - true if it will be deleted even without prefix found

		// A missing search zone means there is nothing to strip.
		if (source != null) {
			// Number of trailing characters to remove from sb; stays negative
			// until one of the candidate suffixes matches.
			int cut = -1;
			for (int idx = 0; idx < search.length && cut < 0; idx++) {
				String candidate = search[idx];
				if (source.endsWith(prefix + candidate)) {
					// Prefix and suffix are both present: remove the pair.
					cut = prefix.length() + candidate.length();
				} else if (without && source.endsWith(candidate)) {
					// Bare suffix is enough when the caller allows it.
					cut = candidate.length();
				}
			}
			if (cut >= 0) {
				sb.delete(sb.length() - cut, sb.length());
				modified = true;
				// The buffer changed, so the region strings must be rebuilt.
				setStrings();
			}
		}
	
private void deleteButSuffixFromElseReplace(java.lang.String source, java.lang.String[] search, java.lang.String prefix, boolean without, java.lang.String from, java.lang.String replace)
Delete a suffix searched in zone "source" if preceded by prefix
or replace it with the replace string if preceded by the prefix in the zone "from"
or delete the suffix if specified

param
source java.lang.String - the primary source zone for search
param
search java.lang.String[] - the strings to search for suppression
param
prefix java.lang.String - the prefix to add to the search string to test
param
without boolean - true if it will be deleted even without prefix found

		if (source!=null)
		{
			for (int i = 0; i < search.length; i++) {
				if ( source.endsWith( prefix + search[i] ))
				{
					sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
					modified = true;
					setStrings();
					break;
				}
				else if ( from!=null && from.endsWith( prefix + search[i] ))
				{
					sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
					modified = true;
					setStrings();
					break;
				}
				else if ( without && source.endsWith( search[i] ))
				{
					sb.delete( sb.length() - search[i].length(), sb.length() );
					modified = true;
					setStrings();
					break;
				}
			}
		}
	
private void deleteFrom(java.lang.String source, java.lang.String[] suffix)
Delete a search string within the source zone

param
source the source zone for search
param
suffix the strings to search for suppression

		if (source!=null)
		{
			for (int i = 0; i < suffix.length; i++) {
				if (source.endsWith( suffix[i] ))
				{
					sb.delete( sb.length() - suffix[i].length(), sb.length());
					modified = true;
					setStrings();
					break;
				}
			}
		}
	
private boolean deleteFromIfPrecededIn(java.lang.String source, java.lang.String[] search, java.lang.String from, java.lang.String prefix)
Delete a suffix searched in zone "source" if zone "from" contains prefix + search string

param
source java.lang.String - the primary source zone for search
param
search java.lang.String[] - the strings to search for suppression
param
from java.lang.String - the secondary source zone for search
param
prefix java.lang.String - the prefix to add to the search string to test
return
boolean - true if modified

		// Without a primary zone no suffix can match.
		if (source == null) {
			return false;
		}
		for (int idx = 0; idx < search.length; idx++) {
			String candidate = search[idx];
			// The suffix must end the primary zone AND be preceded by the
			// prefix inside the secondary zone.
			boolean matches = source.endsWith(candidate)
					&& from != null
					&& from.endsWith(prefix + candidate);
			if (matches) {
				// Only the suffix is removed; the prefix stays in the buffer.
				// Note: this path does not touch the "modified" flag.
				sb.delete(sb.length() - candidate.length(), sb.length());
				setStrings();
				return true;
			}
		}
		return false;
	
private boolean deleteFromIfTestVowelBeforeIn(java.lang.String source, java.lang.String[] search, boolean vowel, java.lang.String from)
Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel

param
source java.lang.String - the primary source zone for search
param
search java.lang.String[] - the strings to search for suppression
param
vowel boolean - true if we need a vowel before the search string
param
from java.lang.String - the secondary source zone for search (where vowel could be)
return
boolean - true if modified

		// Both zones are required for the test.
		if (source == null || from == null) {
			return false;
		}
		for (int idx = 0; idx < search.length; idx++) {
			String candidate = search[idx];
			if (!source.endsWith(candidate)) {
				continue;
			}
			// Distance from the end of the buffer to the letter just before the suffix.
			int before = candidate.length() + 1;
			if (before > from.length()) {
				// The secondary zone is too short to hold a letter before the suffix.
				continue;
			}
			boolean precededByVowel = isVowel(sb.charAt(sb.length() - before));
			if (precededByVowel == vowel) {
				sb.delete(sb.length() - candidate.length(), sb.length());
				modified = true;
				// The buffer changed, so the region strings must be rebuilt.
				setStrings();
				return true;
			}
		}
		return false;
	
private boolean isStemmable(java.lang.String term)
Checks a term if it can be processed correctly.

return
boolean - true if, and only if, the given term consists in letters.

		// Index of the only uppercase letter seen so far, or -1 when none was seen.
		int upperAt = -1;
		final int length = term.length();
		for (int pos = 0; pos < length; pos++) {
			char current = term.charAt(pos);
			// Any non-letter character makes the term unstemmable.
			if (!Character.isLetter(current)) {
				return false;
			}
			if (Character.isUpperCase(current)) {
				// A second uppercase letter makes the term unstemmable.
				if (upperAt >= 0) {
					return false;
				}
				upperAt = pos;
			}
		}
		// A single uppercase letter is accepted only at the very start of the term.
		return upperAt <= 0;
    
private boolean isVowel(char ch)
Test if a char is a French vowel, including accented ones

param
ch the char to test
return
boolean - true if the char is a vowel

		// Only lowercase letters are listed. The uppercase markers 'U', 'I' and 'Y'
		// written by treatVowels are therefore reported as non-vowels.
		switch (ch)
		{
			// plain vowels
			case 'a':
			case 'e':
			case 'i':
			case 'o':
			case 'u':
			case 'y':
			// accented vowels
			case 'â':
			case 'à':
			case 'ë':
			case 'é':
			case 'ê':
			case 'è':
			case 'ï':
			case 'î':
			case 'ô':
			case 'ü':
			case 'ù':
			case 'û':
				return true;
			default:
				return false;
		}
	
private boolean replaceFrom(java.lang.String source, java.lang.String[] search, java.lang.String replace)
Replace a search string with another within the source zone

param
source java.lang.String - the source zone for search
param
search java.lang.String[] - the strings to search for replacement
param
replace java.lang.String - the replacement string

		// Without a search zone nothing can be replaced.
		if (source == null) {
			return false;
		}
		// Candidates are tried in array order; the first match wins.
		for (int idx = 0; idx < search.length; idx++) {
			String candidate = search[idx];
			if (source.endsWith(candidate)) {
				int start = sb.length() - candidate.length();
				sb.replace(start, sb.length(), replace);
				modified = true;
				// The buffer changed, so the region strings must be rebuilt.
				setStrings();
				return true;
			}
		}
		return false;
	
private java.lang.String retrieveR(java.lang.StringBuffer buffer)
Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string
"R is the region after the first non-vowel following a vowel or is the null region at the end of the word if there is no such non-vowel"

param
buffer java.lang.StringBuffer - the in buffer
return
java.lang.String - the resulting string

		final int len = buffer.length();

		// Skip forward to the first vowel of the buffer.
		int firstVowel = 0;
		while (firstVowel < len && !isVowel(buffer.charAt(firstVowel))) {
			firstVowel++;
		}
		if (firstVowel == len) {
			// No vowel at all: the region is null.
			return null;
		}

		// From that vowel, skip forward to the first non-vowel.
		int consonant = firstVowel;
		while (consonant < len && isVowel(buffer.charAt(consonant))) {
			consonant++;
		}

		// The region is null when no non-vowel follows, or when the non-vowel
		// is the last character (nothing would be left after it).
		if (consonant + 1 >= len) {
			return null;
		}
		return buffer.substring(consonant + 1, len);
	
private java.lang.String retrieveRV(java.lang.StringBuffer buffer)
Retrieve the "RV zone" from a buffer and return the corresponding string
"If the word begins with two vowels, RV is the region after the third letter, otherwise the region after the first vowel not at the beginning of the word, or the end of the word if these positions cannot be found."

param
buffer java.lang.StringBuffer - the in buffer
return
java.lang.String - the resulting string

		final int len = buffer.length();
		// Buffers of three characters or fewer have no RV region.
		if (len <= 3) {
			return null;
		}
		// Word starts with two vowels: RV is everything after the third letter.
		if (isVowel(buffer.charAt(0)) && isVowel(buffer.charAt(1))) {
			return buffer.substring(3, len);
		}
		// Otherwise look for the first vowel that is not the first letter.
		// The scan starts at index 1, so 0 doubles as the "not found" value.
		int vowelAt = 0;
		for (int idx = 1; idx < len && vowelAt == 0; idx++) {
			if (isVowel(buffer.charAt(idx))) {
				vowelAt = idx;
			}
		}
		// NOTE(review): when no such vowel exists vowelAt stays 0 and the region
		// returned starts after the first letter, rather than being the end of
		// the word - confirm this is intended.
		return (vowelAt + 1 < len) ? buffer.substring(vowelAt + 1, len) : null;
	
private void setStrings()
Sets the search region Strings
it needs to be done each time the buffer was modified

		// Recompute every region from the current content of sb.
		R0 = sb.toString();
		RV = retrieveRV(sb);
		R1 = retrieveR(sb);
		if (R1 == null) {
			// No R1 means there cannot be an R2 either.
			R2 = null;
		} else {
			// R2 is obtained by applying the R rule to R1 itself, using the
			// temporary buffer as scratch space.
			tb.setLength(0);
			tb.append(R1);
			R2 = retrieveR(tb);
		}
	
protected java.lang.String stem(java.lang.String term)
Stems the given term to a unique discriminator.

param
term java.lang.String - The term that should be stemmed
return
java.lang.String Discriminator for term



                                 
          
		if ( !isStemmable( term ) ) {
			return term;
		}

		// Use lowercase for medium stemming.
		term = term.toLowerCase();

		// Reset the StringBuffer.
		sb.delete( 0, sb.length() );
		sb.insert( 0, term );

		// reset the booleans
		modified = false;
		suite = false;

		sb = treatVowels( sb );

		setStrings();

		step1();

		if (!modified || suite)
		{
			if (RV != null)
			{
				suite = step2a();
				if (!suite)
					step2b();
			}
		}

		if (modified || suite)
			step3();
		else
			step4();

		step5();

		step6();

		return sb.toString();
    
private void step1()
First step of the Porter Algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation

		// Suffix groups that are tested several times below.
		final String[] ement = { "ements", "ement" };
		final String[] ite = { "ités", "ité" };
		final String[] ifIve = { "ifs", "ives", "if", "ive" };

		// Plain deletions in R2.
		deleteFrom( R2, new String[] { "ances", "iqUes", "ismes", "ables", "istes",
				"ance", "iqUe", "isme", "able", "iste" } );

		// Straight replacements in R2.
		replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
		replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
		replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );

		// "ation"-like endings, optionally preceded by "ic".
		deleteButSuffixFromElseReplace( R2,
				new String[] { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" },
				"ic", true, R0, "iqU" );

		// "ement(s)" preceded by various stems.
		deleteButSuffixFromElseReplace( R2, ement, "eus", false, R0, "eux" );
		deleteButSuffixFrom( R2, ement, "ativ", false );
		deleteButSuffixFrom( R2, ement, "iv", false );
		deleteButSuffixFrom( R2, ement, "abl", false );
		deleteButSuffixFrom( R2, ement, "iqU", false );

		deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
		deleteFrom( RV, ement );

		// "ité(s)" preceded by various stems.
		deleteButSuffixFromElseReplace( R2, ite, "abil", false, R0, "abl" );
		deleteButSuffixFromElseReplace( R2, ite, "ic", false, R0, "iqU" );
		deleteButSuffixFrom( R2, ite, "iv", true );

		// "if"/"ive" endings.
		deleteButSuffixFromElseReplace( R2, ifIve, "icat", false, R0, "iqU" );
		deleteButSuffixFromElseReplace( R2, ifIve, "at", true, R2, "iqU" );

		replaceFrom( R0, new String[] { "eaux" }, "eau" );

		replaceFrom( R1, new String[] { "aux" }, "al" );

		deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );

		deleteFrom( R2, new String[] { "eux" } );

		// Any of the three changes below requests step 2a through the "suite" flag.
		if (replaceFrom( RV, new String[] { "amment" }, "ant" )) {
			suite = true;
		}
		if (replaceFrom( RV, new String[] { "emment" }, "ent" )) {
			suite = true;
		}
		if (deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV )) {
			suite = true;
		}

	
private boolean step2a()
Second step (A) of the Porter Algorithm
Will be performed if nothing changed in the first step or if changes were made to the amment, emment, ments or ment suffixes
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation

return
boolean - true if something changed in the StringBuffer

		// Verb endings starting with "i"; removed from RV only when the letter
		// before them is not a vowel. Order matters: the first match wins.
		// ("issais" is listed twice; the second entry can never be reached.)
		final String[] iEndings = {
			"îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
			"irent", "iriez", "irez", "irions", "irons", "iront",
			"issaIent", "issais", "issantes", "issante", "issants", "issant",
			"issait", "issais", "issions", "issons", "issiez", "issez", "issent",
			"isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i"
		};
		boolean changed = deleteFromIfTestVowelBeforeIn( RV, iEndings, false, RV );
		return changed;
	
private void step2b()
Second step (B) of the Porter Algorithm
Will be performed if step 2 A was performed unsuccessfully
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation

		// Endings that are simply removed when they close RV.
		final String[] erEndings = {
			"eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
			"erons", "eront","erez", "èrent", "era", "ées", "iez",
			"ée", "és", "er", "ez", "é"
		};
		deleteFrom( RV, erEndings );

		// Endings starting with "a": removed from RV, together with a
		// preceding "e" when there is one.
		final String[] aEndings = {
			"assions", "assiez", "assent", "asses", "asse", "aIent",
			"antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
			"ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a"
		};
		deleteButSuffixFrom( RV, aEndings, "e", true );

		// Finally drop a trailing "ions" located in R2.
		deleteFrom( R2, new String[] { "ions" } );
	
private void step3()
Third step of the Porter Algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation

		// Normalise the last letter of the buffer:
		// a trailing 'Y' becomes 'i' and a trailing 'ç' becomes 'c'.
		if (sb.length()>0)
		{
			char ch = sb.charAt( sb.length()-1 );
			if (ch == 'Y')
			{
				sb.setCharAt( sb.length()-1, 'i' );
				// The buffer changed, so the region strings must be rebuilt.
				setStrings();
			}
			else if (ch == 'ç')
			{
				sb.setCharAt( sb.length()-1, 'c' );
				// The buffer changed, so the region strings must be rebuilt.
				setStrings();
			}
		}
	
private void step4()
Fourth step of the Porter Algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation

		// Strip a trailing 's' unless the letter before it is a, i, o, u, è or s.
		if (sb.length() > 1)
		{
			char ch = sb.charAt( sb.length()-1 );
			if (ch == 's')
			{
				char b = sb.charAt( sb.length()-2 );
				if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
				{
					sb.delete( sb.length() - 1, sb.length());
					// The buffer changed, so the region strings must be rebuilt.
					setStrings();
				}
			}
		}
		// Remove "ion" from R2 when RV shows it preceded by "s"; if that did
		// not apply, try again with a preceding "t".
		boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
		if (!found)
			found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );

		// Residual endings handled in RV.
		replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
		deleteFrom( RV, new String[] { "e" } );
		// A trailing "ë" is removed only when R0 shows it preceded by "gu".
		deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
	
private void step5()
Fifth step of the Porter Algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation

		if (R0 != null)
		{
			if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
			{
				sb.delete( sb.length() - 1, sb.length() );
				setStrings();
			}
		}
	
private void step6()
Sixth (and last!) step of the Porter Algorithm
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation

		// Replace an 'é' or 'è' with a plain 'e' when it is the last vowel of
		// the word and is followed by at least one non-vowel.
		if (R0!=null && R0.length()>0)
		{
			boolean seenVowel = false;
			boolean seenConson = false;
			int pos = -1;
			// Walk backwards from the end of the word.
			for (int i = R0.length()-1; i > -1; i--)
			{
				char ch = R0.charAt(i);
				if (isVowel(ch))
				{
					if (!seenVowel)
					{
						// First vowel met from the end: remember it if it is é or è.
						if (ch == 'é' || ch == 'è')
						{
							pos = i;
							break;
						}
					}
					seenVowel = true;
				}
				else
				{
					// A non-vowel after a vowel was already seen ends the scan.
					if (seenVowel)
						break;
					else
						seenConson = true;
				}
			}
			// Only the buffer is updated here; the region strings are not rebuilt.
			if (pos > -1 && seenConson && !seenVowel)
				sb.setCharAt(pos, 'e');
		}
	
private java.lang.StringBuffer treatVowels(java.lang.StringBuffer buffer)
Turns u and i preceded AND followed by a vowel to UpperCase
Turns y preceded OR followed by a vowel to UpperCase
Turns u preceded by q to UpperCase

param
buffer java.lang.StringBuffer - the buffer to treat
return
java.lang.StringBuffer - the treated buffer

		// Certain u, i and y are rewritten in upper case. isVowel only lists
		// lowercase letters, so the rewritten ones no longer count as vowels.
		// The buffer is modified in place and returned.
		for ( int c = 0; c < buffer.length(); c++ ) {
			char ch = buffer.charAt( c );

			if (c == 0) // first char
			{
				// Only a following letter can be tested here.
				if (buffer.length()>1)
				{
					if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
						buffer.setCharAt( c, 'Y' );
				}
			}
			else if (c == buffer.length()-1) // last char
			{
				// Only the preceding letter can be tested here.
				if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
					buffer.setCharAt( c, 'U' );
				if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
					buffer.setCharAt( c, 'Y' );
			}
			else // other cases
			{
				if (ch == 'u')
				{
					// 'u' after 'q', or 'u' between two vowels.
					if (buffer.charAt( c - 1) == 'q')
						buffer.setCharAt( c, 'U' );
					else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
						buffer.setCharAt( c, 'U' );
				}
				if (ch == 'i')
				{
					// 'i' between two vowels.
					if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
						buffer.setCharAt( c, 'I' );
				}
				if (ch == 'y')
				{
					// 'y' next to a vowel on either side.
					if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
						buffer.setCharAt( c, 'Y' );
				}
			}
		}

		return buffer;