FileDocCategorySizeDatePackage
GreekCharsets.javaAPI DocApache Lucene 1.912844Mon Feb 20 09:18:48 GMT 2006org.apache.lucene.analysis.el

GreekCharsets

public class GreekCharsets extends Object
The GreekCharsets class contains encoding schemes (charsets) and a toLowerCase() method implementation for Greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253. Each encoding scheme contains lowercase (positions 0-35) and uppercase (positions 36-68) characters, including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding the definition of a new charset as well as the required logic in the toLowerCase() method.
author
Panagiotis Astithas, past@ebs.gr

Fields Summary
public static char[]
UnicodeGreek
public static char[]
ISO
public static char[]
CP1253
Constructors Summary
Methods Summary
public static chartoLowerCase(char letter, char[] charset)


          
    
        if (charset == UnicodeGreek) {
        	// First deal with lower case, not accented letters
            if (letter >= '\u03B1" && letter <= '\u03C9")
            {
            	// Special case 'small final sigma', where we return 'small sigma'
                if (letter == '\u03C2") {
                	return '\u03C3";
                } else {
                	return letter;
                }
            }
            // Then deal with lower case, accented letters
            // alpha with acute
            if (letter == '\u03AC") {
            	return '\u03B1";
            }
            // epsilon with acute
            if (letter == '\u03AD") {
            	return '\u03B5";
            }
            // eta with acute
            if (letter == '\u03AE") {
            	return '\u03B7";
            }
            // iota with acute, iota with diaeresis, iota with acute and diaeresis
            if (letter == '\u03AF" || letter == '\u03CA" || letter == '\u0390") {
            	return '\u03B9";
            }
            // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
            if (letter == '\u03CD" || letter == '\u03CB" || letter == '\u03B0") {
            	return '\u03C5";
            }
            // omicron with acute
            if (letter == '\u03CC") {
            	return '\u03BF";
            }
            // omega with acute
            if (letter == '\u03CE") {
            	return '\u03C9";
            }
            // After that, deal with upper case, not accented letters
            if (letter >= '\u0391" && letter <= '\u03A9")
            {
                return (char) (letter + 32);
            }
            // Finally deal with upper case, accented letters
            // alpha with acute
            if (letter == '\u0386") {
            	return '\u03B1";
            }
            // epsilon with acute
            if (letter == '\u0388") {
            	return '\u03B5";
            }
            // eta with acute
            if (letter == '\u0389") {
            	return '\u03B7";
            }
            // iota with acute, iota with diaeresis
            if (letter == '\u038A" || letter == '\u03AA") {
            	return '\u03B9";
            }
            // upsilon with acute, upsilon with diaeresis
            if (letter == '\u038E" || letter == '\u03AB") {
            	return '\u03C5";
            }
            // omicron with acute
            if (letter == '\u038C") {
            	return '\u03BF";
            }
            // omega with acute
            if (letter == '\u038F") {
            	return '\u03C9";
            }
        } else if (charset == ISO) {
        	// First deal with lower case, not accented letters
            if (letter >= 0xe1 && letter <= 0xf9)
            {
            	// Special case 'small final sigma', where we return 'small sigma'
                if (letter == 0xf2) {
                	return 0xf3;
                } else {
                	return letter;
                }
            }
            // Then deal with lower case, accented letters
            // alpha with acute
            if (letter == 0xdc) {
            	return 0xe1;
            }
            // epsilon with acute
            if (letter == 0xdd) {
            	return 0xe5;
            }
            // eta with acute
            if (letter == 0xde) {
            	return 0xe7;
            }
            // iota with acute, iota with diaeresis, iota with acute and diaeresis
            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) {
            	return '\u03B9";
            }
            // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) {
            	return 0xf5;
            }
            // omicron with acute
            if (letter == 0xfc) {
            	return 0xef;
            }
            // omega with acute
            if (letter == 0xfe) {
            	return 0xf9;
            }
            // After that, deal with upper case, not accented letters
            if (letter >= 0xc1 && letter <= 0xd9) {
                return (char) (letter + 32);
            }
            // Finally deal with upper case, accented letters
            // alpha with acute
            if (letter == 0xb6) {
            	return 0xe1;
            }
            // epsilon with acute
            if (letter == 0xb8) {
            	return 0xe5;
            }
            // eta with acute
            if (letter == 0xb9) {
            	return 0xe7;
            }
            // iota with acute, iota with diaeresis
            if (letter == 0xba || letter == 0xda) {
            	return 0xe9;
            }
            // upsilon with acute, upsilon with diaeresis
            if (letter == 0xbe || letter == 0xdb) {
            	return 0xf5;
            }
            // omicron with acute
            if (letter == 0xbc) {
            	return 0xef;
            }
            // omega with acute
            if (letter == 0xbf) {
            	return 0xf9;
            }
        } else if (charset == CP1253) {
        	// First deal with lower case, not accented letters
            if (letter >= 0xe1 && letter <= 0xf9)
            {
            	// Special case 'small final sigma', where we return 'small sigma'
                if (letter == 0xf2) {
                	return 0xf3;
                } else {
                	return letter;
                }
            }
            // Then deal with lower case, accented letters
            // alpha with acute
            if (letter == 0xdc) {
            	return 0xe1;
            }
            // epsilon with acute
            if (letter == 0xdd) {
            	return 0xe5;
            }
            // eta with acute
            if (letter == 0xde) {
            	return 0xe7;
            }
            // iota with acute, iota with diaeresis, iota with acute and diaeresis
            if (letter == 0xdf || letter == 0xfa || letter == 0xc0) {
            	return '\u03B9";
            }
            // upsilon with acute, upsilon with diaeresis, upsilon with acute and diaeresis
            if (letter == 0xfd || letter == 0xfb || letter == 0xe0) {
            	return 0xf5;
            }
            // omicron with acute
            if (letter == 0xfc) {
            	return 0xef;
            }
            // omega with acute
            if (letter == 0xfe) {
            	return 0xf9;
            }
            // After that, deal with upper case, not accented letters
            if (letter >= 0xc1 && letter <= 0xd9) {
                return (char) (letter + 32);
            }
            // Finally deal with upper case, accented letters
            // alpha with acute
            if (letter == 0xa2) {
            	return 0xe1;
            }
            // epsilon with acute
            if (letter == 0xb8) {
            	return 0xe5;
            }
            // eta with acute
            if (letter == 0xb9) {
            	return 0xe7;
            }
            // iota with acute, iota with diaeresis
            if (letter == 0xba || letter == 0xda) {
            	return 0xe9;
            }
            // upsilon with acute, upsilon with diaeresis
            if (letter == 0xbe || letter == 0xdb) {
            	return 0xf5;
            }
            // omicron with acute
            if (letter == 0xbc) {
            	return 0xef;
            }
            // omega with acute
            if (letter == 0xbf) {
            	return 0xf9;
            }
        }

        return Character.toLowerCase(letter);