FileDocCategorySizeDatePackage
Metaphone.javaAPI DocAndroid 1.5 API14223Wed May 06 22:41:10 BST 2009org.apache.commons.codec.language

Metaphone

public class Metaphone extends Object implements StringEncoder
Encodes a string into a metaphone value.

Initial Java implementation by William B. Brogden. December, 1997. Permission given by wbrogden for code to be used anywhere.

Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p 39.

author
Apache Software Foundation
version
$Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $

Fields Summary
private String
vowels
Five values in the English language
private String
frontv
Variable used in Metaphone algorithm
private String
varson
Variable used in Metaphone algorithm
private int
maxCodeLen
The max code length for metaphone is 4
Constructors Summary
public Metaphone()
Creates an instance of the Metaphone encoder


                
      
        super();
    
Methods Summary
public java.lang.Objectencode(java.lang.Object pObject)
Encodes an Object using the metaphone algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.

param
pObject Object to encode
return
An object (or type java.lang.String) containing the metaphone code which corresponds to the String supplied.
throws
EncoderException if the parameter supplied is not of type java.lang.String

        if (!(pObject instanceof java.lang.String)) {
            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 
        }
        return metaphone((String) pObject);
    
public java.lang.Stringencode(java.lang.String pString)
Encodes a String using the Metaphone algorithm.

param
pString String object to encode
return
The metaphone code corresponding to the String supplied

        return metaphone(pString);   
    
public intgetMaxCodeLen()
Returns the maxCodeLen.

return
int

 return this.maxCodeLen; 
private booleanisLastChar(int wdsz, int n)

        return n + 1 == wdsz;
    
public booleanisMetaphoneEqual(java.lang.String str1, java.lang.String str2)
Tests is the metaphones of two strings are identical.

param
str1 First of two strings to compare
param
str2 Second of two strings to compare
return
true if the metaphones of these strings are identical, false otherwise.

        return metaphone(str1).equals(metaphone(str2));
    
private booleanisNextChar(java.lang.StringBuffer string, int index, char c)

        boolean matches = false;
        if( index >= 0 &&
            index < string.length() - 1 ) {
            matches = string.charAt(index + 1) == c;
        }
        return matches;
    
private booleanisPreviousChar(java.lang.StringBuffer string, int index, char c)

        boolean matches = false;
        if( index > 0 &&
            index < string.length() ) {
            matches = string.charAt(index - 1) == c;
        }
        return matches;
    
private booleanisVowel(java.lang.StringBuffer string, int index)

        return (this.vowels.indexOf(string.charAt(index)) >= 0);
    
public java.lang.Stringmetaphone(java.lang.String txt)
Find the metaphone value of a String. This is similar to the soundex algorithm, but better at finding similar sounding words. All input is converted to upper case. Limitations: Input format is expected to be a single ASCII word with only characters in the A - Z range, no punctuation or numbers.

param
txt String to find the metaphone code for
return
A metaphone code corresponding to the String supplied

        boolean hard = false ;
        if ((txt == null) || (txt.length() == 0)) {
            return "" ;
        }
        // single character is itself
        if (txt.length() == 1) {
            return txt.toUpperCase() ;
        }
      
        char[] inwd = txt.toUpperCase().toCharArray() ;
      
        StringBuffer local = new StringBuffer(40); // manipulate
        StringBuffer code = new StringBuffer(10) ; //   output
        // handle initial 2 characters exceptions
        switch(inwd[0]) {
        case 'K" : 
        case 'G" : 
        case 'P" : /* looking for KN, etc*/
            if (inwd[1] == 'N") {
                local.append(inwd, 1, inwd.length - 1);
            } else {
                local.append(inwd);
            }
            break;
        case 'A": /* looking for AE */
            if (inwd[1] == 'E") {
                local.append(inwd, 1, inwd.length - 1);
            } else {
                local.append(inwd);
            }
            break;
        case 'W" : /* looking for WR or WH */
            if (inwd[1] == 'R") {   // WR -> R
                local.append(inwd, 1, inwd.length - 1); 
                break ;
            }
            if (inwd[1] == 'H") {
                local.append(inwd, 1, inwd.length - 1);
                local.setCharAt(0, 'W"); // WH -> W
            } else {
                local.append(inwd);
            }
            break;
        case 'X" : /* initial X becomes S */
            inwd[0] = 'S";
            local.append(inwd);
            break ;
        default :
            local.append(inwd);
        } // now local has working string with initials fixed

        int wdsz = local.length();
        int n = 0 ;

        while ((code.length() < this.getMaxCodeLen()) && 
               (n < wdsz) ) { // max code size of 4 works well
            char symb = local.charAt(n) ;
            // remove duplicate letters except C
            if ((symb != 'C") && (isPreviousChar( local, n, symb )) ) {
                n++ ;
            } else { // not dup
                switch(symb) {
                case 'A" : case 'E" : case 'I" : case 'O" : case 'U" :
                    if (n == 0) { 
                        code.append(symb);
                    }
                    break ; // only use vowel if leading char
                case 'B" :
                    if ( isPreviousChar(local, n, 'M") && 
                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
                        break;
                    }
                    code.append(symb);
                    break;
                case 'C" : // lots of C special cases
                    /* discard if SCI, SCE or SCY */
                    if ( isPreviousChar(local, n, 'S") && 
                         !isLastChar(wdsz, n) && 
                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) { 
                        break;
                    }
                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
                        code.append('X"); 
                        break;
                    }
                    if (!isLastChar(wdsz, n) && 
                        (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
                        code.append('S");
                        break; // CI,CE,CY -> S
                    }
                    if (isPreviousChar(local, n, 'S") &&
                        isNextChar(local, n, 'H") ) { // SCH->sk
                        code.append('K") ; 
                        break ;
                    }
                    if (isNextChar(local, n, 'H")) { // detect CH
                        if ((n == 0) && 
                            (wdsz >= 3) && 
                            isVowel(local,2) ) { // CH consonant -> K consonant
                            code.append('K");
                        } else { 
                            code.append('X"); // CHvowel -> X
                        }
                    } else { 
                        code.append('K");
                    }
                    break ;
                case 'D" :
                    if (!isLastChar(wdsz, n + 1) && 
                        isNextChar(local, n, 'G") && 
                        (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 
                        code.append('J"); n += 2 ;
                    } else { 
                        code.append('T");
                    }
                    break ;
                case 'G" : // GH silent at end or before consonant
                    if (isLastChar(wdsz, n + 1) && 
                        isNextChar(local, n, 'H")) {
                        break;
                    }
                    if (!isLastChar(wdsz, n + 1) &&  
                        isNextChar(local,n,'H") && 
                        !isVowel(local,n+2)) {
                        break;
                    }
                    if ((n > 0) && 
                        ( regionMatch(local, n, "GN") ||
                          regionMatch(local, n, "GNED") ) ) {
                        break; // silent G
                    }
                    if (isPreviousChar(local, n, 'G")) {
                        hard = true ;
                    } else {
                        hard = false ;
                    }
                    if (!isLastChar(wdsz, n) && 
                        (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && 
                        (!hard)) {
                        code.append('J");
                    } else {
                        code.append('K");
                    }
                    break ;
                case 'H":
                    if (isLastChar(wdsz, n)) {
                        break ; // terminal H
                    }
                    if ((n > 0) && 
                        (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
                        break;
                    }
                    if (isVowel(local,n+1)) {
                        code.append('H"); // Hvowel
                    }
                    break;
                case 'F": 
                case 'J" : 
                case 'L" :
                case 'M": 
                case 'N" : 
                case 'R" :
                    code.append(symb); 
                    break;
                case 'K" :
                    if (n > 0) { // not initial
                        if (!isPreviousChar(local, n, 'C")) {
                            code.append(symb);
                        }
                    } else {
                        code.append(symb); // initial K
                    }
                    break ;
                case 'P" :
                    if (isNextChar(local,n,'H")) {
                        // PH -> F
                        code.append('F");
                    } else {
                        code.append(symb);
                    }
                    break ;
                case 'Q" :
                    code.append('K");
                    break;
                case 'S" :
                    if (regionMatch(local,n,"SH") || 
                        regionMatch(local,n,"SIO") || 
                        regionMatch(local,n,"SIA")) {
                        code.append('X");
                    } else {
                        code.append('S");
                    }
                    break;
                case 'T" :
                    if (regionMatch(local,n,"TIA") || 
                        regionMatch(local,n,"TIO")) {
                        code.append('X"); 
                        break;
                    }
                    if (regionMatch(local,n,"TCH")) {
                        // Silent if in "TCH"
                        break;
                    }
                    // substitute numeral 0 for TH (resembles theta after all)
                    if (regionMatch(local,n,"TH")) {
                        code.append('0");
                    } else {
                        code.append('T");
                    }
                    break ;
                case 'V" :
                    code.append('F"); break ;
                case 'W" : case 'Y" : // silent if not followed by vowel
                    if (!isLastChar(wdsz,n) && 
                        isVowel(local,n+1)) {
                        code.append(symb);
                    }
                    break ;
                case 'X" :
                    code.append('K"); code.append('S");
                    break ;
                case 'Z" :
                    code.append('S"); break ;
                } // end switch
                n++ ;
            } // end else from symb != 'C'
            if (code.length() > this.getMaxCodeLen()) { 
                code.setLength(this.getMaxCodeLen()); 
            }
        }
        return code.toString();
    
private booleanregionMatch(java.lang.StringBuffer string, int index, java.lang.String test)

        boolean matches = false;
        if( index >= 0 &&
            (index + test.length() - 1) < string.length() ) {
            String substring = string.substring( index, index + test.length());
            matches = substring.equals( test );
        }
        return matches;
    
public voidsetMaxCodeLen(int maxCodeLen)
Sets the maxCodeLen.

param
maxCodeLen The maxCodeLen to set

 this.maxCodeLen = maxCodeLen;