File Doc Category Size Date Package
GreekCharsets.java API Doc Apache Lucene 1.9 12844 Mon Feb 20 09:18:48 GMT 2006 org.apache.lucene.analysis.el

GreekCharsets

java.lang.Object

public class GreekCharsets extends Object

The GreekCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation for Greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253. Each encoding scheme contains lowercase (positions 0-35) and uppercase (positions 36-68) characters, including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding the definition of a new charset as well as the required logic in the toLowerCase() method.

author: Panagiotis Astithas, past@ebs.gr

Fields Summary
public static char[]
UnicodeGreek
public static char[]
ISO
public static char[]
CP1253
Constructors Summary
Methods Summary
public static char toLowerCase(char letter, char[] charset)
if (charset == UnicodeGreek) { // First deal with lower case, not accented letters if (letter >= '\u03B1" && letter <= '\u03C9") { // Special case 'small final sigma', where we return 'small sigma' if (letter == '\u03C2") { return '\u03C3"; } else { return letter; } } // Then deal with lower case, accented letters // alpha with acute if (letter == '\u03AC") { return '\u03B1"; } // epsilon with acute if (letter == '\u03AD") { return '\u03B5"; } // eta with acute if (letter == '\u03AE") { return '\u03B7"; } // iota with acute, iota with diaeresis, iota with acute and diaeresis if (letter == '\u03AF" || letter == '\u03CA" || letter == '\u0390") { return '\u03B9"; } // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis if (letter == '\u03CD" || letter == '\u03CB" || letter == '\u03B0") { return '\u03C5"; } // omicron with acute if (letter == '\u03CC") { return '\u03BF"; } // omega with acute if (letter == '\u03CE") { return '\u03C9"; } // After that, deal with upper case, not accented letters if (letter >= '\u0391" && letter <= '\u03A9") { return (char) (letter + 32); } // Finally deal with upper case, accented letters // alpha with acute if (letter == '\u0386") { return '\u03B1"; } // epsilon with acute if (letter == '\u0388") { return '\u03B5"; } // eta with acute if (letter == '\u0389") { return '\u03B7"; } // iota with acute, iota with diaeresis if (letter == '\u038A" || letter == '\u03AA") { return '\u03B9"; } // upsilon with acute, upsilon with diaeresis if (letter == '\u038E" || letter == '\u03AB") { return '\u03C5"; } // omicron with acute if (letter == '\u038C") { return '\u03BF"; } // omega with acute if (letter == '\u038F") { return '\u03C9"; } } else if (charset == ISO) { // First deal with lower case, not accented letters if (letter >= 0xe1 && letter <= 0xf9) { // Special case 'small final sigma', where we return 'small sigma' if (letter == 0xf2) { return 0xf3; } else { return letter; } } // Then deal with lower case, accented 
letters // alpha with acute if (letter == 0xdc) { return 0xe1; } // epsilon with acute if (letter == 0xdd) { return 0xe5; } // eta with acute if (letter == 0xde) { return 0xe7; } // iota with acute, iota with diaeresis, iota with acute and diaeresis if (letter == 0xdf || letter == 0xfa || letter == 0xc0) { return '\u03B9"; } // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis if (letter == 0xfd || letter == 0xfb || letter == 0xe0) { return 0xf5; } // omicron with acute if (letter == 0xfc) { return 0xef; } // omega with acute if (letter == 0xfe) { return 0xf9; } // After that, deal with upper case, not accented letters if (letter >= 0xc1 && letter <= 0xd9) { return (char) (letter + 32); } // Finally deal with upper case, accented letters // alpha with acute if (letter == 0xb6) { return 0xe1; } // epsilon with acute if (letter == 0xb8) { return 0xe5; } // eta with acute if (letter == 0xb9) { return 0xe7; } // iota with acute, iota with diaeresis if (letter == 0xba || letter == 0xda) { return 0xe9; } // upsilon with acute, upsilon with diaeresis if (letter == 0xbe || letter == 0xdb) { return 0xf5; } // omicron with acute if (letter == 0xbc) { return 0xef; } // omega with acute if (letter == 0xbf) { return 0xf9; } } else if (charset == CP1253) { // First deal with lower case, not accented letters if (letter >= 0xe1 && letter <= 0xf9) { // Special case 'small final sigma', where we return 'small sigma' if (letter == 0xf2) { return 0xf3; } else { return letter; } } // Then deal with lower case, accented letters // alpha with acute if (letter == 0xdc) { return 0xe1; } // epsilon with acute if (letter == 0xdd) { return 0xe5; } // eta with acute if (letter == 0xde) { return 0xe7; } // iota with acute, iota with diaeresis, iota with acute and diaeresis if (letter == 0xdf || letter == 0xfa || letter == 0xc0) { return '\u03B9"; } // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis if (letter == 0xfd || letter == 0xfb || 
letter == 0xe0) { return 0xf5; } // omicron with acute if (letter == 0xfc) { return 0xef; } // omega with acute if (letter == 0xfe) { return 0xf9; } // After that, deal with upper case, not accented letters if (letter >= 0xc1 && letter <= 0xd9) { return (char) (letter + 32); } // Finally deal with upper case, accented letters // alpha with acute if (letter == 0xa2) { return 0xe1; } // epsilon with acute if (letter == 0xb8) { return 0xe5; } // eta with acute if (letter == 0xb9) { return 0xe7; } // iota with acute, iota with diaeresis if (letter == 0xba || letter == 0xda) { return 0xe9; } // upsilon with acute, upsilon with diaeresis if (letter == 0xbe || letter == 0xdb) { return 0xf5; } // omicron with acute if (letter == 0xbc) { return 0xef; } // omega with acute if (letter == 0xbf) { return 0xf9; } } return Character.toLowerCase(letter);