FileDocCategorySizeDatePackage
GreekAnalyzer.javaAPI DocApache Lucene 1.95962Mon Feb 20 09:18:48 GMT 2006org.apache.lucene.analysis.el

GreekAnalyzer.java

package org.apache.lucene.analysis.el;

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.Reader;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;

/**
 * Analyzer for the Greek language. Supports an external list of stopwords (words
 * that will not be indexed at all).
 * A default set of stopwords is used unless an alternative list is specified.
 *
 * @author  Panagiotis Astithas, past@ebs.gr
 */
public final class GreekAnalyzer extends Analyzer
{
    // the letters are indexes to the charset array (see GreekCharsets.java)
    private static char A = 6;
    private static char B = 7;
    private static char G = 8;
    private static char D = 9;
    private static char E = 10;
    private static char Z = 11;
    private static char H = 12;
    private static char TH = 13;
    private static char I = 14;
    private static char K = 15;
    private static char L = 16;
    private static char M = 17;
    private static char N = 18;
    private static char KS = 19;
    private static char O = 20;
    private static char P = 21;
    private static char R = 22;
    private static char S = 24;	// skip final sigma
    private static char T = 25;
    private static char Y = 26;
    private static char F = 27;
    private static char X = 28;
    private static char PS = 29;
    private static char W = 30;

    /**
     * List of typical Greek stopwords.
     */
    private static char[][] GREEK_STOP_WORDS = {
        {O},
		{H},
		{T, O},
        {O, I},
		{T, A},
		{T, O, Y},
		{T, H, S},
		{T, W, N},
		{T, O, N},
		{T, H, N},
		{K, A, I},
		{K, I},
		{K},
		{E, I, M, A, I},
		{E, I, S, A, I},
		{E, I, N, A, I},
		{E, I, M, A, S, T, E},
		{E, I, S, T, E},
		{S, T, O},
		{S, T, O, N},
		{S, T, H},
		{S, T, H, N},
		{M, A},
		{A, L, L, A},
		{A, P, O},
		{G, I, A},
		{P, R, O, S},
		{M, E},
		{S, E},
		{W, S},
		{P, A, R, A},
		{A, N, T, I},
		{K, A, T, A},
		{M, E, T, A},
		{TH, A},
		{N, A},
		{D, E},
		{D, E, N},
		{M, H},
		{M, H, N},
		{E, P, I},
		{E, N, W},
		{E, A, N},
		{A, N},
		{T, O, T, E},
		{P, O, Y},
		{P, W, S},
		{P, O, I, O, S},
		{P, O, I, A},
		{P, O, I, O},
		{P, O, I, O, I},
		{P, O, I, E, S},
		{P, O, I, W, N},
		{P, O, I, O, Y, S},
		{A, Y, T, O, S},
		{A, Y, T, H},
		{A, Y, T, O},
		{A, Y, T, O, I},
		{A, Y, T, W, N},
		{A, Y, T, O, Y, S},
		{A, Y, T, E, S},
		{A, Y, T, A},
		{E, K, E, I, N, O, S},
		{E, K, E, I, N, H},
		{E, K, E, I, N, O},
		{E, K, E, I, N, O, I},
		{E, K, E, I, N, E, S},
		{E, K, E, I, N, A},
		{E, K, E, I, N, W, N},
		{E, K, E, I, N, O, Y, S},
		{O, P, W, S},
		{O, M, W, S},
		{I, S, W, S},
		{O, S, O},
		{O, T, I}
    };

    /**
     * Contains the stopwords used with the StopFilter.
     */
    private Set stopSet = new HashSet();

    /**
     * Charset for Greek letters.
     * Represents encoding for 24 lowercase Greek letters.
     * Predefined charsets can be taken from GreekCharSets class
     */
    private char[] charset;

    public GreekAnalyzer() {
        charset = GreekCharsets.UnicodeGreek;
        stopSet = StopFilter.makeStopSet(
                    makeStopWords(GreekCharsets.UnicodeGreek));
    }

    /**
     * Builds an analyzer.
     */
    public GreekAnalyzer(char[] charset)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
    }

    /**
     * Builds an analyzer with the given stop words.
     */
    public GreekAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(stopwords);
    }

    // Takes greek stop words and translates them to a String array, using
    // the given charset
    private static String[] makeStopWords(char[] charset)
    {
        String[] res = new String[GREEK_STOP_WORDS.length];
        for (int i = 0; i < res.length; i++)
        {
            char[] theStopWord = GREEK_STOP_WORDS[i];
            // translate the word,using the charset
            StringBuffer theWord = new StringBuffer();
            for (int j = 0; j < theStopWord.length; j++)
            {
                theWord.append(charset[theStopWord[j]]);
            }
            res[i] = theWord.toString();
        }
        return res;
    }

    /**
     * Builds an analyzer with the given stop words.
     */
    public GreekAnalyzer(char[] charset, Hashtable stopwords)
    {
        this.charset = charset;
        stopSet = new HashSet(stopwords.keySet());
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @return  A TokenStream build from a StandardTokenizer filtered with
     *                  GreekLowerCaseFilter and StopFilter
     */
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
    	TokenStream result = new StandardTokenizer(reader);
        result = new GreekLowerCaseFilter(result, charset);
        result = new StopFilter(result, stopSet);
        return result;
    }
}