File Doc Category Size Date Package
DutchStemmer.java API Doc Apache Lucene 2.1.0 10504 Wed Feb 14 10:46:32 GMT 2007 org.apache.lucene.analysis.nl

DutchStemmer

java.lang.Object

/**
 * A stemmer for Dutch words. The algorithm is an implementation of the Dutch
 * stemming algorithm in Martin Porter's snowball project.
 *
 * <p>An instance keeps mutable working state (the buffer, the R1/R2 region
 * offsets and the "removed e" flag) in fields that every call to
 * {@link #stem(String)} overwrites.
 *
 * @author Edwin de Jonge (ejne at cbs.nl)
 */
public class DutchStemmer {

    /** Buffer for the terms while stemming them. */
    private StringBuffer sb = new StringBuffer();

    /** Set by {@link #step2} when it stripped a trailing "e"; read by {@link #step3b}. */
    private boolean _removedE;

    /** Optional exception dictionary (term to stem); may be {@code null}. */
    private java.util.Map _stemDict;

    /** Start offset of region R1 in the buffer (never less than 3). */
    private int _R1;

    /** Start offset of region R2 in the buffer. */
    private int _R2;

    /**
     * Stems the given term to a unique <tt>discriminator</tt>.
     *
     * <p>The term is lower-cased first. Terms containing a non-letter are
     * returned (lower-cased) without stemming. If a stem dictionary is set and
     * contains the lower-cased term, the dictionary value is returned when it
     * is a {@code String}, otherwise {@code null} is returned.
     *
     * @param term the term that should be stemmed
     * @return discriminator for <tt>term</tt>
     */
    public String stem(String term) {
        term = term.toLowerCase();
        if (!isStemmable(term)) {
            return term;
        }
        if (_stemDict != null && _stemDict.containsKey(term)) {
            Object entry = _stemDict.get(term);
            return (entry instanceof String) ? (String) entry : null;
        }
        // An empty term passes isStemmable(); without this guard storeYandI()
        // would call charAt(0) on an empty buffer and throw.
        if (term.length() == 0) {
            return term;
        }

        // Reset the StringBuffer.
        sb.setLength(0);
        sb.append(term);

        // Stemming starts here...
        substitute(sb);
        storeYandI(sb);
        _R1 = Math.max(3, getRIndex(sb, 0));
        step1(sb);
        step2(sb);
        // R2 is computed only after steps 1 and 2 have shortened the buffer.
        _R2 = getRIndex(sb, _R1);
        step3a(sb);
        step3b(sb);
        step4(sb);
        reStoreYandI(sb);
        return sb.toString();
    }

    /**
     * Installs the exception dictionary consulted by {@link #stem(String)}.
     *
     * @param dict map from lower-cased term to its stem; may be {@code null}
     */
    void setStemDictionary(java.util.Map dict) {
        _stemDict = dict;
    }

    /**
     * Tries to strip a trailing "ene" or "en" (tried in that order). The
     * ending is removed only if it starts at or after R1 and is preceded by a
     * valid en-ending character; the remaining stem is then undoubled.
     *
     * @param sb string being stemmed
     * @return {@code true} if an ending was removed
     */
    private boolean enEnding(StringBuffer sb) {
        String[] endings = {"ene", "en"};
        String s = sb.toString();
        for (int i = 0; i < endings.length; i++) {
            String end = endings[i];
            int index = s.length() - end.length();
            if (s.endsWith(end) && index >= _R1 && isValidEnEnding(sb, index - 1)) {
                sb.delete(index, index + end.length());
                unDouble(sb, index);
                return true;
            }
        }
        return false;
    }

    /**
     * Finds the offset just after the first non-vowel that is preceded by a
     * vowel, scanning from {@code start} (a start of 0 is treated as 1).
     *
     * @param sb    string being stemmed
     * @param start offset at which to start scanning
     * @return the offset after the matching non-vowel, or one past the final
     *         scan position when no such position exists
     */
    private int getRIndex(StringBuffer sb, int start) {
        int i = (start == 0) ? 1 : start;
        for (; i < sb.length(); i++) {
            // first non-vowel preceded by a vowel
            if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
                return i + 1;
            }
        }
        return i + 1;
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @return true if, and only if, the given term consists only of letters
     *         (which includes the empty term)
     */
    private boolean isStemmable(String term) {
        for (int c = 0; c < term.length(); c++) {
            if (!Character.isLetter(term.charAt(c))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether the character at {@code index} may precede an "en"
     * ending: it must be a non-vowel and must not be the "m" of "gem".
     *
     * @param sb    string being stemmed
     * @param index offset of the character directly before the ending
     */
    private boolean isValidEnEnding(StringBuffer sb, int index) {
        char c = sb.charAt(index);
        if (isVowel(c)) {
            return false;
        }
        // NOTE(review): this compares the character itself with 3, so it is
        // never true for letters; possibly "index < 3" was intended. Kept
        // as-is so stemming output does not change - confirm before altering.
        if (c < 3) {
            return false;
        }
        // ends with "gem"?
        if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e') {
            return false;
        }
        return true;
    }

    /**
     * Tells whether the character at {@code index} may precede an "s"/"se"
     * ending: it must be a non-vowel other than "j".
     *
     * @param sb    string being stemmed
     * @param index offset of the character directly before the ending
     */
    private boolean isValidSEnding(StringBuffer sb, int index) {
        char c = sb.charAt(index);
        return !(isVowel(c) || c == 'j');
    }

    /**
     * Tells whether {@code c} is one of a, e, i, o, u, y or e-grave. The
     * upper-case markers 'I' and 'Y' written by {@link #storeYandI} are not
     * vowels here.
     */
    private boolean isVowel(char c) {
        switch (c) {
            case 'e':
            case 'a':
            case 'o':
            case 'i':
            case 'u':
            case 'y':
            case '\u00e8': // e with grave accent
                return true;
            default:
                return false;
        }
    }

    /**
     * Turns the 'I' and 'Y' markers written by {@link #storeYandI} back into
     * lower-case 'i' and 'y'.
     */
    private void reStoreYandI(StringBuffer sb) {
        String restored = sb.toString().replace('I', 'i').replace('Y', 'y');
        sb.setLength(0);
        sb.append(restored);
    }

    /**
     * Step 1: rewrites a trailing "heden" in R1 to "heid"; otherwise strips an
     * "ene"/"en" ending; otherwise strips "se" or "s" when it lies in R1 and
     * follows a valid s-ending character. At most one rule is applied.
     *
     * @param sb string being stemmed
     */
    private void step1(StringBuffer sb) {
        if (_R1 >= sb.length()) {
            return;
        }
        String s = sb.toString();

        if (s.endsWith("heden")) {
            // Replace only the suffix itself, and only when it lies in R1.
            int start = s.length() - 5;
            if (start >= _R1) {
                sb.replace(start, s.length(), "heid");
            }
            return;
        }

        if (enEnding(sb)) {
            return;
        }

        int index;
        if (s.endsWith("se")
                && (index = s.length() - 2) >= _R1
                && isValidSEnding(sb, index - 1)) {
            sb.delete(index, index + 2);
            return;
        }
        if (s.endsWith("s")
                && (index = s.length() - 1) >= _R1
                && isValidSEnding(sb, index - 1)) {
            sb.delete(index, index + 1);
        }
    }

    /**
     * Delete suffix e if in R1 and preceded by a non-vowel, and then undouble
     * the ending. Records in {@code _removedE} whether the suffix was removed.
     *
     * @param sb String being stemmed
     */
    private void step2(StringBuffer sb) {
        _removedE = false;
        if (_R1 >= sb.length()) {
            return;
        }
        String s = sb.toString();
        int index = s.length() - 1;
        if (index >= _R1 && s.endsWith("e") && !isVowel(sb.charAt(index - 1))) {
            sb.delete(index, index + 1);
            unDouble(sb);
            _removedE = true;
        }
    }

    /**
     * Delete "heid" if it lies in R2 and is not preceded by "c", then try to
     * strip an en-ending from what remains.
     *
     * @param sb String being stemmed
     */
    private void step3a(StringBuffer sb) {
        if (_R2 >= sb.length()) {
            return;
        }
        String s = sb.toString();
        int index = s.length() - 4;
        if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
            sb.delete(index, index + 4); // remove heid
            enEnding(sb);
        }
    }

    /**
     * <p>A d-suffix, or derivational suffix, enables a new word, often with a
     * different grammatical category, or with a different sense, to be built
     * from another word. Whether a d-suffix can be attached is discovered not
     * from the rules of grammar, but by referring to a dictionary. So in
     * English, ness can be added to certain adjectives to form corresponding
     * nouns (littleness, kindness, foolishness ...) but not to all adjectives
     * (not for example, to big, cruel, wise ...) d-suffixes can be used to
     * change meaning, often in rather exotic ways.</p>
     * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
     *
     * @param sb String being stemmed
     */
    private void step3b(StringBuffer sb) {
        if (_R2 >= sb.length()) {
            return;
        }
        String s = sb.toString();
        int index = 0;

        if ((s.endsWith("end") || s.endsWith("ing")) && (index = s.length() - 3) >= _R2) {
            sb.delete(index, index + 3);
            if (sb.charAt(index - 2) == 'i' && sb.charAt(index - 1) == 'g') {
                // A preceding "ig" is removed too when it lies in R2 and does
                // not follow an "e".
                if (sb.charAt(index - 3) != 'e' && index - 2 >= _R2) {
                    index -= 2;
                    sb.delete(index, index + 2);
                }
            } else {
                unDouble(sb, index);
            }
            return;
        }
        if (s.endsWith("ig") && (index = s.length() - 2) >= _R2) {
            if (sb.charAt(index - 1) != 'e') {
                sb.delete(index, index + 2);
            }
            return;
        }
        if (s.endsWith("lijk") && (index = s.length() - 4) >= _R2) {
            sb.delete(index, index + 4);
            step2(sb);
            return;
        }
        if (s.endsWith("baar") && (index = s.length() - 4) >= _R2) {
            sb.delete(index, index + 4);
            return;
        }
        if (s.endsWith("bar") && (index = s.length() - 3) >= _R2) {
            // "bar" is removed only when the most recent step2 stripped an "e".
            if (_removedE) {
                sb.delete(index, index + 3);
            }
            return;
        }
    }

    /**
     * Undouble vowel. If the word ends CVD, where C is a non-vowel, D is a
     * non-vowel other than I, and V is double a, e, o or u, remove one of the
     * vowels from V (for example, maan -> man, brood -> brod).
     *
     * @param sb String being stemmed
     */
    private void step4(StringBuffer sb) {
        int len = sb.length();
        if (len < 4) {
            return;
        }
        char c = sb.charAt(len - 4);
        char v1 = sb.charAt(len - 3);
        char v2 = sb.charAt(len - 2);
        char d = sb.charAt(len - 1);
        if (v1 == v2
                && d != 'I'
                && v1 != 'i'
                && isVowel(v1)
                && !isVowel(d)
                && !isVowel(c)) {
            sb.delete(len - 2, len - 1);
        }
    }

    /**
     * Marks letters that must not count as vowels by upper-casing them: an
     * initial "y", any "y" after a vowel, and any "i" between two vowels.
     * The buffer must not be empty.
     */
    private void storeYandI(StringBuffer sb) {
        if (sb.charAt(0) == 'y') {
            sb.setCharAt(0, 'Y');
        }

        int last = sb.length() - 1;
        for (int i = 1; i < last; i++) {
            char ch = sb.charAt(i);
            if (ch == 'i') {
                if (isVowel(sb.charAt(i - 1)) && isVowel(sb.charAt(i + 1))) {
                    sb.setCharAt(i, 'I');
                }
            } else if (ch == 'y') {
                if (isVowel(sb.charAt(i - 1))) {
                    sb.setCharAt(i, 'Y');
                }
            }
        }

        if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1))) {
            sb.setCharAt(last, 'Y');
        }
    }

    /**
     * Substitutes a, e, i, o and u carrying a diaeresis or an acute accent
     * with the plain vowel. Literals are written as Unicode escapes so the
     * source compiles regardless of the file encoding.
     */
    private void substitute(StringBuffer buffer) {
        for (int i = 0; i < buffer.length(); i++) {
            switch (buffer.charAt(i)) {
                case '\u00e4': // a with diaeresis
                case '\u00e1': // a with acute
                    buffer.setCharAt(i, 'a');
                    break;
                case '\u00eb': // e with diaeresis
                case '\u00e9': // e with acute
                    buffer.setCharAt(i, 'e');
                    break;
                case '\u00fc': // u with diaeresis
                case '\u00fa': // u with acute
                    buffer.setCharAt(i, 'u');
                    break;
                case '\u00ef': // i with diaeresis
                case '\u00ed': // i with acute
                    buffer.setCharAt(i, 'i');
                    break;
                case '\u00f6': // o with diaeresis
                case '\u00f3': // o with acute
                    buffer.setCharAt(i, 'o');
                    break;
                default:
                    break;
            }
        }
    }

    /** Undoubles the end of the whole buffer; see {@link #unDouble(StringBuffer, int)}. */
    private void unDouble(StringBuffer sb) {
        unDouble(sb, sb.length());
    }

    /**
     * If the first {@code endIndex} characters end in kk, tt, dd, nn, mm or
     * ff, removes the second letter of that pair.
     *
     * @param sb       string being stemmed
     * @param endIndex exclusive end offset of the part to inspect
     */
    private void unDouble(StringBuffer sb, int endIndex) {
        String s = sb.substring(0, endIndex);
        if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd")
                || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
            sb.delete(endIndex - 1, endIndex);
        }
    }
}