File | Doc | Category | Size | Date | Package
PorterStemmer.java | API Doc | Apache Lucene 1.4.3 | 14083 | Tue Mar 30 00:48:00 BST 2004 | org.apache.lucene.analysis

PorterStemmer

/**
 * Stemmer, implementing the Porter Stemming Algorithm.
 *
 * <p>The Stemmer class transforms a word into its root form. The input word
 * can be provided a character at a time (by calling {@link #add(char)}), or
 * at once by calling one of the various {@code stem(something)} methods.
 *
 * <p>Instances keep the word being processed in mutable fields, so a single
 * instance handles one word at a time.
 */
public class PorterStemmer {

  /** Working buffer holding the characters of the word being stemmed. */
  private char[] b;

  /** Number of valid characters in {@link #b}; also the next write offset for add(). */
  private int i;

  /** Scratch offset: ends() sets it to the last index of the stem preceding a matched suffix. */
  private int j;

  /** Index in {@link #b} of the last character of the current word. */
  private int k;

  /** Index in {@link #b} of the first character of the current word. */
  private int k0;

  /** Set once the buffer content has been altered by a stemming step. */
  private boolean dirty;

  // NOTE(review): the extracted listing shows these constants without their
  // initializers; 50 and 1 are believed to match the upstream Lucene source - confirm.
  /** Number of slots by which {@link #b} grows when add() runs out of room. */
  private static final int INC = 50;

  /** Spare slots that add() keeps free beyond the current length. */
  private static final int EXTRA = 1;

  /** Creates a stemmer with an empty buffer of {@code INC} characters. */
  public PorterStemmer() {
    b = new char[INC];
    i = 0;
  }

  /**
   * Resets the stemmer so it can stem another word. If you invoke the stemmer
   * by calling {@link #add(char)} and then {@link #stem()}, you must call
   * {@code reset()} before starting another word.
   */
  public void reset() {
    i = 0;
    dirty = false;
  }

  /**
   * Adds a character to the word being stemmed. When you are finished adding
   * characters, you can call {@link #stem()} to process the word.
   *
   * @param ch the character to append
   */
  public void add(char ch) {
    if (b.length <= i + EXTRA) {
      // Grow by a fixed increment, preserving what has been added so far.
      char[] grown = new char[b.length + INC];
      System.arraycopy(b, 0, grown, 0, b.length);
      b = grown;
    }
    b[i++] = ch;
  }

  /**
   * After a word has been stemmed, it can be retrieved by {@code toString()},
   * or a reference to the internal buffer can be retrieved by
   * {@link #getResultBuffer()} and {@link #getResultLength()} (which is
   * generally more efficient).
   *
   * @return a new String holding the first {@code getResultLength()} buffer characters
   */
  public String toString() {
    return new String(b, 0, i);
  }

  /**
   * Returns the length of the word resulting from the stemming process.
   *
   * @return the number of valid characters in the result buffer
   */
  public int getResultLength() {
    return i;
  }

  /**
   * Returns a reference to a character buffer containing the results of the
   * stemming process. You also need to consult {@link #getResultLength()} to
   * determine the length of the result.
   *
   * @return the internal buffer itself (not a copy)
   */
  public char[] getResultBuffer() {
    return b;
  }

  /**
   * Tells whether {@code b[i]} is a consonant: anything other than a, e, i,
   * o, u, with 'y' counting as a consonant only at the start of the word or
   * when the preceding character is not a consonant.
   */
  private boolean cons(int i) {
    switch (b[i]) {
      case 'a':
      case 'e':
      case 'i':
      case 'o':
      case 'u':
        return false;
      case 'y':
        return (i == k0) ? true : !cons(i - 1);
      default:
        return true;
    }
  }

  /**
   * Measures {@code b[k0..j]}: the number of times a run of vowels is
   * followed by a run of consonants within that range.
   */
  private int m() {
    int n = 0;
    int i = k0;
    // Skip the leading consonant run, if any.
    while (true) {
      if (i > j) {
        return n;
      }
      if (!cons(i)) {
        break;
      }
      i++;
    }
    i++;
    while (true) {
      // Consume a vowel run; reaching a consonant completes one vowel-consonant pair.
      while (true) {
        if (i > j) {
          return n;
        }
        if (cons(i)) {
          break;
        }
        i++;
      }
      i++;
      n++;
      // Consume the consonant run that follows.
      while (true) {
        if (i > j) {
          return n;
        }
        if (!cons(i)) {
          break;
        }
        i++;
      }
      i++;
    }
  }

  /** Tells whether {@code b[k0..j]} contains at least one vowel. */
  private boolean vowelinstem() {
    int i;
    for (i = k0; i <= j; i++) {
      if (!cons(i)) {
        return true;
      }
    }
    return false;
  }

  /** Tells whether {@code b[j-1]} and {@code b[j]} are the same consonant. */
  private boolean doublec(int j) {
    if (j < k0 + 1) {
      return false;
    }
    if (b[j] != b[j - 1]) {
      return false;
    }
    return cons(j);
  }

  /**
   * Tells whether {@code b[i-2], b[i-1], b[i]} has the form
   * consonant - vowel - consonant and the final consonant is not w, x or y.
   */
  private boolean cvc(int i) {
    if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) {
      return false;
    } else {
      int ch = b[i];
      if (ch == 'w' || ch == 'x' || ch == 'y') {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells whether the current word {@code b[k0..k]} ends with {@code s}.
   * On a match, {@link #j} is set to the index just before the suffix.
   */
  private boolean ends(String s) {
    int l = s.length();
    int o = k - l + 1;
    if (o < k0) {
      return false;
    }
    for (int i = 0; i < l; i++) {
      if (b[o + i] != s.charAt(i)) {
        return false;
      }
    }
    j = k - l;
    return true;
  }

  /**
   * Overwrites the buffer from index {@code j+1} with {@code s}, moves
   * {@link #k} to the new end of the word and marks the word as changed.
   */
  void setto(String s) {
    int l = s.length();
    int o = j + 1;
    for (int i = 0; i < l; i++) {
      b[o + i] = s.charAt(i);
    }
    k = j + l;
    dirty = true;
  }

  /** Calls {@link #setto(String)} with {@code s} only when {@code m() > 0}. */
  void r(String s) {
    if (m() > 0) {
      setto(s);
    }
  }

  /**
   * Handles a trailing "s" ("sses", "ies", plain "s") and then the suffixes
   * "eed", "ed" and "ing", tidying the remaining stem (re-adding "e" or
   * dropping a doubled consonant) according to the checks below.
   */
  private void step1() {
    if (b[k] == 's') {
      if (ends("sses")) {
        k -= 2;
      } else if (ends("ies")) {
        setto("i");
      } else if (b[k - 1] != 's') {
        k--;
      }
    }
    if (ends("eed")) {
      if (m() > 0) {
        k--;
      }
    } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
      k = j;
      if (ends("at")) {
        setto("ate");
      } else if (ends("bl")) {
        setto("ble");
      } else if (ends("iz")) {
        setto("ize");
      } else if (doublec(k)) {
        // Drop one of the doubled consonants, except for l, s and z.
        int ch = b[k--];
        if (ch == 'l' || ch == 's' || ch == 'z') {
          k++;
        }
      } else if (m() == 1 && cvc(k)) {
        setto("e");
      }
    }
  }

  /** Turns a terminal "y" into "i" when the preceding stem contains a vowel. */
  private void step2() {
    if (ends("y") && vowelinstem()) {
      b[k] = 'i';
      dirty = true;
    }
  }

  /**
   * Maps double suffixes to single ones (for example "ization" to "ize"),
   * provided the stem before the suffix has {@code m() > 0}. The switch on the
   * penultimate character only narrows down which suffixes need testing.
   */
  private void step3() {
    if (k == k0) {
      return; /* For Bug 1 */
    }
    switch (b[k - 1]) {
      case 'a':
        if (ends("ational")) { r("ate"); break; }
        if (ends("tional")) { r("tion"); break; }
        break;
      case 'c':
        if (ends("enci")) { r("ence"); break; }
        if (ends("anci")) { r("ance"); break; }
        break;
      case 'e':
        if (ends("izer")) { r("ize"); break; }
        break;
      case 'l':
        if (ends("bli")) { r("ble"); break; }
        if (ends("alli")) { r("al"); break; }
        if (ends("entli")) { r("ent"); break; }
        if (ends("eli")) { r("e"); break; }
        if (ends("ousli")) { r("ous"); break; }
        break;
      case 'o':
        if (ends("ization")) { r("ize"); break; }
        if (ends("ation")) { r("ate"); break; }
        if (ends("ator")) { r("ate"); break; }
        break;
      case 's':
        if (ends("alism")) { r("al"); break; }
        if (ends("iveness")) { r("ive"); break; }
        if (ends("fulness")) { r("ful"); break; }
        if (ends("ousness")) { r("ous"); break; }
        break;
      case 't':
        if (ends("aliti")) { r("al"); break; }
        if (ends("iviti")) { r("ive"); break; }
        if (ends("biliti")) { r("ble"); break; }
        break;
      case 'g':
        if (ends("logi")) { r("log"); break; }
        break;
      default:
        break;
    }
  }

  /**
   * Handles suffixes such as "icate", "ful" and "ness", replacing or removing
   * them when the stem before the suffix has {@code m() > 0}.
   */
  private void step4() {
    switch (b[k]) {
      case 'e':
        if (ends("icate")) { r("ic"); break; }
        if (ends("ative")) { r(""); break; }
        if (ends("alize")) { r("al"); break; }
        break;
      case 'i':
        if (ends("iciti")) { r("ic"); break; }
        break;
      case 'l':
        if (ends("ical")) { r("ic"); break; }
        if (ends("ful")) { r(""); break; }
        break;
      case 's':
        if (ends("ness")) { r(""); break; }
        break;
      default:
        break;
    }
  }

  /**
   * Removes suffixes such as "ant" and "ence" when the stem before the
   * suffix has {@code m() > 1}. A {@code break} out of the switch means a
   * suffix matched; a {@code return} means there is nothing to strip.
   */
  private void step5() {
    if (k == k0) {
      return; /* for Bug 1 */
    }
    switch (b[k - 1]) {
      case 'a':
        if (ends("al")) break;
        return;
      case 'c':
        if (ends("ance")) break;
        if (ends("ence")) break;
        return;
      case 'e':
        if (ends("er")) break;
        return;
      case 'i':
        if (ends("ic")) break;
        return;
      case 'l':
        if (ends("able")) break;
        if (ends("ible")) break;
        return;
      case 'n':
        if (ends("ant")) break;
        if (ends("ement")) break;
        if (ends("ment")) break;
        /* element etc. not stripped before the m */
        if (ends("ent")) break;
        return;
      case 'o':
        /* j >= 0 fixes Bug 2 */
        if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
        /* takes care of -ous */
        if (ends("ou")) break;
        return;
      case 's':
        if (ends("ism")) break;
        return;
      case 't':
        if (ends("ate")) break;
        if (ends("iti")) break;
        return;
      case 'u':
        if (ends("ous")) break;
        return;
      case 'v':
        if (ends("ive")) break;
        return;
      case 'z':
        if (ends("ize")) break;
        return;
      default:
        return;
    }
    if (m() > 1) {
      k = j;
    }
  }

  /**
   * Removes a final "e" when {@code m() > 1} (or {@code m() == 1} and the
   * stem does not end consonant-vowel-consonant), and reduces a final "ll"
   * to "l" when {@code m() > 1}.
   */
  private void step6() {
    j = k;
    if (b[k] == 'e') {
      int a = m();
      if (a > 1 || a == 1 && !cvc(k - 1)) {
        k--;
      }
    }
    if (b[k] == 'l' && doublec(k) && m() > 1) {
      k--;
    }
  }

  /**
   * Stem a word provided as a String. Returns the result as a String.
   *
   * @param s the word to stem
   * @return the stemmed word, or {@code s} itself when stemming changed nothing
   */
  public String stem(String s) {
    if (stem(s.toCharArray(), s.length())) {
      return toString();
    } else {
      return s;
    }
  }

  /**
   * Stem a word contained in a char[]. Returns true if the stemming process
   * resulted in a word different from the input. You can retrieve the result
   * with {@link #getResultLength()}/{@link #getResultBuffer()} or
   * {@link #toString()}.
   *
   * @param word the characters of the word; the whole array is used
   * @return true if the word was changed
   */
  public boolean stem(char[] word) {
    return stem(word, word.length);
  }

  /**
   * Stem a word contained in a portion of a char[] array. Returns true if the
   * stemming process resulted in a word different from the input. You can
   * retrieve the result with {@link #getResultLength()}/
   * {@link #getResultBuffer()} or {@link #toString()}.
   *
   * @param wordBuffer array containing the word
   * @param offset index in {@code wordBuffer} of the word's first character
   * @param wordLen number of characters in the word
   * @return true if the word was changed
   */
  public boolean stem(char[] wordBuffer, int offset, int wordLen) {
    reset();
    if (b.length < wordLen) {
      // The old content is irrelevant after reset(), so no copy is needed.
      b = new char[wordLen + EXTRA];
    }
    System.arraycopy(wordBuffer, offset, b, 0, wordLen);
    i = wordLen;
    return stem(0);
  }

  /**
   * Stem a word contained in a leading portion of a char[] array. Returns
   * true if the stemming process resulted in a word different from the input.
   * You can retrieve the result with {@link #getResultLength()}/
   * {@link #getResultBuffer()} or {@link #toString()}.
   *
   * @param word array whose first {@code wordLen} characters form the word
   * @param wordLen number of characters in the word
   * @return true if the word was changed
   */
  public boolean stem(char[] word, int wordLen) {
    return stem(word, 0, wordLen);
  }

  /**
   * Stem the word placed into the Stemmer buffer through calls to
   * {@link #add(char)}. Returns true if the stemming process resulted in a
   * word different from the input. You can retrieve the result with
   * {@link #getResultLength()}/{@link #getResultBuffer()} or
   * {@link #toString()}.
   *
   * @return true if the word was changed
   */
  public boolean stem() {
    return stem(0);
  }

  /**
   * Stems the buffered word, treating {@code i0} as the index of its first
   * character. Words of fewer than three characters are left untouched.
   *
   * @param i0 index in the buffer where the word starts
   * @return true if the word was changed
   */
  public boolean stem(int i0) {
    k = i - 1;
    k0 = i0;
    if (k > k0 + 1) {
      step1();
      step2();
      step3();
      step4();
      step5();
      step6();
    }
    // Also, a word is considered dirty if we lopped off letters
    // Thanks to Ifigenia Vairelles for pointing this out.
    if (i != k + 1) {
      dirty = true;
    }
    i = k + 1;
    return dirty;
  }

  /**
   * Test program for demonstrating the Stemmer. It reads a file and stems
   * each word, writing the result to standard out.
   * Usage: Stemmer file-name
   *
   * @param args names of the files to process
   */
  public static void main(String[] args) {
    PorterStemmer s = new PorterStemmer();

    for (int i = 0; i < args.length; i++) {
      try {
        java.io.InputStream in = new java.io.FileInputStream(args[i]);
        try {
          byte[] buffer = new byte[1024];
          int bufferLen = in.read(buffer);
          int offset = 0;
          s.reset();

          while (true) {
            int ch;
            if (offset < bufferLen) {
              // Mask to 0..255: a sign-extended byte >= 0x80 would otherwise be
              // negative and be mistaken for the end-of-file marker below.
              ch = buffer[offset++] & 0xFF;
            } else {
              bufferLen = in.read(buffer);
              offset = 0;
              if (bufferLen < 0) {
                ch = -1; // end of file
              } else {
                ch = buffer[offset++] & 0xFF;
              }
            }

            if (ch >= 0 && Character.isLetter((char) ch)) {
              s.add(Character.toLowerCase((char) ch));
            } else {
              // A non-letter (or end of file) terminates the current word.
              s.stem();
              System.out.print(s.toString());
              s.reset();
              if (ch < 0) {
                break;
              } else {
                System.out.print((char) ch);
              }
            }
          }
        } finally {
          // Close the stream even when reading fails part-way through.
          in.close();
        }
      } catch (java.io.IOException e) {
        System.out.println("error reading " + args[i]);
      }
    }
  }
}