Metaphonepublic class Metaphone extends Object implements StringEncoderEncodes a string into a metaphone value.
Initial Java implementation by William B. Brogden. December, 1997.
Permission given by wbrogden for code to be used anywhere.
Hanging on the Metaphone by Lawrence Philips in Computer Language of Dec. 1990, p
39.
|
Fields Summary |
---|
private String | vowelsFive values in the English language | private String | frontvVariable used in Metaphone algorithm | private String | varsonVariable used in Metaphone algorithm | private int | maxCodeLenThe max code length for metaphone is 4 |
Constructors Summary |
---|
public Metaphone()Creates an instance of the Metaphone encoder
super();
|
Methods Summary |
---|
public java.lang.Object | encode(java.lang.Object pObject)Encodes an Object using the metaphone algorithm. This method
is provided in order to satisfy the requirements of the
Encoder interface, and will throw an EncoderException if the
supplied object is not of type java.lang.String.
if (!(pObject instanceof java.lang.String)) {
throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
}
return metaphone((String) pObject);
| public java.lang.String | encode(java.lang.String pString)Encodes a String using the Metaphone algorithm.
return metaphone(pString);
| public int | getMaxCodeLen()Returns the maxCodeLen. return this.maxCodeLen;
| private boolean | isLastChar(int wdsz, int n)
return n + 1 == wdsz;
| public boolean | isMetaphoneEqual(java.lang.String str1, java.lang.String str2)Tests is the metaphones of two strings are identical.
return metaphone(str1).equals(metaphone(str2));
| private boolean | isNextChar(java.lang.StringBuffer string, int index, char c)
boolean matches = false;
if( index >= 0 &&
index < string.length() - 1 ) {
matches = string.charAt(index + 1) == c;
}
return matches;
| private boolean | isPreviousChar(java.lang.StringBuffer string, int index, char c)
boolean matches = false;
if( index > 0 &&
index < string.length() ) {
matches = string.charAt(index - 1) == c;
}
return matches;
| private boolean | isVowel(java.lang.StringBuffer string, int index)
return (this.vowels.indexOf(string.charAt(index)) >= 0);
| public java.lang.String | metaphone(java.lang.String txt)Find the metaphone value of a String. This is similar to the
soundex algorithm, but better at finding similar sounding words.
All input is converted to upper case.
Limitations: Input format is expected to be a single ASCII word
with only characters in the A - Z range, no punctuation or numbers.
boolean hard = false ;
if ((txt == null) || (txt.length() == 0)) {
return "" ;
}
// single character is itself
if (txt.length() == 1) {
return txt.toUpperCase() ;
}
char[] inwd = txt.toUpperCase().toCharArray() ;
StringBuffer local = new StringBuffer(40); // manipulate
StringBuffer code = new StringBuffer(10) ; // output
// handle initial 2 characters exceptions
switch(inwd[0]) {
case 'K" :
case 'G" :
case 'P" : /* looking for KN, etc*/
if (inwd[1] == 'N") {
local.append(inwd, 1, inwd.length - 1);
} else {
local.append(inwd);
}
break;
case 'A": /* looking for AE */
if (inwd[1] == 'E") {
local.append(inwd, 1, inwd.length - 1);
} else {
local.append(inwd);
}
break;
case 'W" : /* looking for WR or WH */
if (inwd[1] == 'R") { // WR -> R
local.append(inwd, 1, inwd.length - 1);
break ;
}
if (inwd[1] == 'H") {
local.append(inwd, 1, inwd.length - 1);
local.setCharAt(0, 'W"); // WH -> W
} else {
local.append(inwd);
}
break;
case 'X" : /* initial X becomes S */
inwd[0] = 'S";
local.append(inwd);
break ;
default :
local.append(inwd);
} // now local has working string with initials fixed
int wdsz = local.length();
int n = 0 ;
while ((code.length() < this.getMaxCodeLen()) &&
(n < wdsz) ) { // max code size of 4 works well
char symb = local.charAt(n) ;
// remove duplicate letters except C
if ((symb != 'C") && (isPreviousChar( local, n, symb )) ) {
n++ ;
} else { // not dup
switch(symb) {
case 'A" : case 'E" : case 'I" : case 'O" : case 'U" :
if (n == 0) {
code.append(symb);
}
break ; // only use vowel if leading char
case 'B" :
if ( isPreviousChar(local, n, 'M") &&
isLastChar(wdsz, n) ) { // B is silent if word ends in MB
break;
}
code.append(symb);
break;
case 'C" : // lots of C special cases
/* discard if SCI, SCE or SCY */
if ( isPreviousChar(local, n, 'S") &&
!isLastChar(wdsz, n) &&
(this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
break;
}
if (regionMatch(local, n, "CIA")) { // "CIA" -> X
code.append('X");
break;
}
if (!isLastChar(wdsz, n) &&
(this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
code.append('S");
break; // CI,CE,CY -> S
}
if (isPreviousChar(local, n, 'S") &&
isNextChar(local, n, 'H") ) { // SCH->sk
code.append('K") ;
break ;
}
if (isNextChar(local, n, 'H")) { // detect CH
if ((n == 0) &&
(wdsz >= 3) &&
isVowel(local,2) ) { // CH consonant -> K consonant
code.append('K");
} else {
code.append('X"); // CHvowel -> X
}
} else {
code.append('K");
}
break ;
case 'D" :
if (!isLastChar(wdsz, n + 1) &&
isNextChar(local, n, 'G") &&
(this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
code.append('J"); n += 2 ;
} else {
code.append('T");
}
break ;
case 'G" : // GH silent at end or before consonant
if (isLastChar(wdsz, n + 1) &&
isNextChar(local, n, 'H")) {
break;
}
if (!isLastChar(wdsz, n + 1) &&
isNextChar(local,n,'H") &&
!isVowel(local,n+2)) {
break;
}
if ((n > 0) &&
( regionMatch(local, n, "GN") ||
regionMatch(local, n, "GNED") ) ) {
break; // silent G
}
if (isPreviousChar(local, n, 'G")) {
hard = true ;
} else {
hard = false ;
}
if (!isLastChar(wdsz, n) &&
(this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
(!hard)) {
code.append('J");
} else {
code.append('K");
}
break ;
case 'H":
if (isLastChar(wdsz, n)) {
break ; // terminal H
}
if ((n > 0) &&
(this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
break;
}
if (isVowel(local,n+1)) {
code.append('H"); // Hvowel
}
break;
case 'F":
case 'J" :
case 'L" :
case 'M":
case 'N" :
case 'R" :
code.append(symb);
break;
case 'K" :
if (n > 0) { // not initial
if (!isPreviousChar(local, n, 'C")) {
code.append(symb);
}
} else {
code.append(symb); // initial K
}
break ;
case 'P" :
if (isNextChar(local,n,'H")) {
// PH -> F
code.append('F");
} else {
code.append(symb);
}
break ;
case 'Q" :
code.append('K");
break;
case 'S" :
if (regionMatch(local,n,"SH") ||
regionMatch(local,n,"SIO") ||
regionMatch(local,n,"SIA")) {
code.append('X");
} else {
code.append('S");
}
break;
case 'T" :
if (regionMatch(local,n,"TIA") ||
regionMatch(local,n,"TIO")) {
code.append('X");
break;
}
if (regionMatch(local,n,"TCH")) {
// Silent if in "TCH"
break;
}
// substitute numeral 0 for TH (resembles theta after all)
if (regionMatch(local,n,"TH")) {
code.append('0");
} else {
code.append('T");
}
break ;
case 'V" :
code.append('F"); break ;
case 'W" : case 'Y" : // silent if not followed by vowel
if (!isLastChar(wdsz,n) &&
isVowel(local,n+1)) {
code.append(symb);
}
break ;
case 'X" :
code.append('K"); code.append('S");
break ;
case 'Z" :
code.append('S"); break ;
} // end switch
n++ ;
} // end else from symb != 'C'
if (code.length() > this.getMaxCodeLen()) {
code.setLength(this.getMaxCodeLen());
}
}
return code.toString();
| private boolean | regionMatch(java.lang.StringBuffer string, int index, java.lang.String test)
boolean matches = false;
if( index >= 0 &&
(index + test.length() - 1) < string.length() ) {
String substring = string.substring( index, index + test.length());
matches = substring.equals( test );
}
return matches;
| public void | setMaxCodeLen(int maxCodeLen)Sets the maxCodeLen. this.maxCodeLen = maxCodeLen;
|
|