File | Doc | Category | Size | Date | Package
StringNormalizer.java | API Doc | phoneME MR2 API (J2ME) | 7372 | Wed May 02 18:00:46 BST 2007 | com.sun.j2me.global

StringNormalizer.java

/*
 *   
 *
 * Copyright  1990-2007 Sun Microsystems, Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 only, as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is
 * included at /legal/license.txt).
 * 
 * You should have received a copy of the GNU General Public License
 * version 2 along with this work; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA
 * 
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
 * Clara, CA 95054 or visit www.sun.com if you need additional
 * information or have any questions.
 */

package com.sun.j2me.global;

/**
 * A string normalizer is responsible for decomposing strings into their 
 * canonically decomposed equivalents (Normalization Form D). 
 */
public final class StringNormalizer implements StringDecomposer {
    /** The capacity increment value of the internal buffers. */
    private static final int CAPACITY_INCREMENT = 64;

    /** Mask selecting the bits that identify a UTF-16 surrogate kind. */
    private static final int SURROGATE_MASK = 0xfc00;
    /** Masked value of a high (leading) surrogate code unit. */
    private static final int HIGH_SURROGATE_BITS = 0xd800;
    /** Masked value of a low (trailing) surrogate code unit. */
    private static final int LOW_SURROGATE_BITS = 0xdc00;

    /**
     * Internal decomposition buffer. It holds the current run of encoded
     * code points which is handed out one element at a time.
     */
    private int[] decomposition = new int[CAPACITY_INCREMENT];
    /** Index of the next buffered element to be returned. */
    private int decOffset;
    /** Number of valid elements in the decomposition buffer. */
    private int decLength;
    /** Index of the next unread UTF-16 code unit of the source string. */
    private int strOffset;
    /** Length of the source string in UTF-16 code units. */
    private int strLength;
    /** Offset at which the decomposition (re)starts after a reset. */
    private int strInitOffset;

    /**
     * Max decomposition length as reported by the lookup table. It is used
     * as the amount of free buffer space reserved before each table lookup.
     */
    private final int maxDecomposition;

    /**
     * The string being decomposed.
     */
    private String source;
    /**
     * A lookup table which is used during the normalization.
     */
    private final NormalizationTable table;

    /**
     * Creates a new instance of <code>StringNormalizer</code>. A source
     * string has to be set by one of the <code>setSource</code> methods
     * before any element is requested.
     *
     * @param table a lookup table for the normalization
     */
    public StringNormalizer(NormalizationTable table) {
        this.table = table;
        this.maxDecomposition = table.getMaxDecompositionLength();
    }

    /**
     * Creates a new instance of <code>StringNormalizer</code> which
     * decomposes the given string from its beginning.
     *
     * @param s a string for the normalization
     * @param table a lookup table for the normalization
     */
    public StringNormalizer(String s, NormalizationTable table) {
        this(table);
        setSource(s, 0);
    }

    /**
     * Sets the string for the normalization. The decomposition starts from
     * the beginning of the string.
     *
     * @param s the string
     */
    public final void setSource(String s) {
        setSource(s, 0);
    }

    /**
     * Sets the string for the normalization and restarts the decomposition.
     * The offset is not validated here; it is used as a
     * <code>charAt</code> index when the first element is requested.
     *
     * @param s the string
     * @param offset the offset to start the normalization from
     */
    public final void setSource(String s, int offset) {
        source = s;
        strLength = s.length();
        strInitOffset = offset;
        reset();
    }

    /**
     * Restarts the decomposition. Any buffered elements are discarded and
     * reading continues from the initial offset of the current source.
     */
    public final void reset() {
        decOffset = 0;
        decLength = 0;
        strOffset = strInitOffset;
    }

    /**
     * Returns the next code point value from the source string. It expects
     * the input string to be UTF-16 encoded. A high surrogate which is
     * followed by a low surrogate is combined with it into a single
     * supplementary code point; any other code unit (including an unpaired
     * surrogate) is returned unchanged.
     *
     * @return the next code point value or <code>EOF_ELEMENT</code> if the
     *      end of string is reached
     */
    public final int nextUTF32() {
        if (strOffset >= strLength) {
            return EOF_ELEMENT;
        }

        int cp = (int)source.charAt(strOffset++);
        if (((cp & SURROGATE_MASK) == HIGH_SURROGATE_BITS)
                && (strOffset < strLength)) {
            int cp2 = (int)source.charAt(strOffset);
            if ((cp2 & SURROGATE_MASK) == LOW_SURROGATE_BITS) {
                // high surrogate = 110110wwwwxxxxxx,
                // low surrogate  = 110111yyyyyyyyyy,
                // code point     = (wwww + 1) xxxxxx yyyyyyyyyy
                int wwww = (cp >> 6) & 0xf;
                cp = ((wwww + 1) << 16) | ((cp & 0x3f) << 10) |
                        (cp2 & 0x3ff);

                // the low surrogate has been consumed as well
                ++strOffset;
            }
        }

        return cp;
    }

    /**
     * Returns the next encoded code point value from the normalized input
     * string. The methods of the <code>NormalizationTable</code> class can be
     * used to inspect the returned value. Returns <code>EOF_ELEMENT</code> if
     * the end of string is reached.
     *
     * @return the next encoded code point value from the normalized input 
     *      string or <code>EOF_ELEMENT</code> if the end of string is reached
     * @see NormalizationTable
     */
    public int getNextElement() {
        // hand out what is left from the previously decomposed run
        if (decOffset < decLength) {
            return decomposition[decOffset++];
        }

        int value = nextUTF32();
        if (value == EOF_ELEMENT) {
            return EOF_ELEMENT;
        }

        // nothing buffered needs to be preserved at this point
        ensureCapacity(0, maxDecomposition);
        value = table.getCanonicalDecomposition(decomposition, 0, value);

        if (NormalizationTable.isSingleCodePoint(value)) {
            if (NormalizationTable.isStable(value)) {
                // fast path, a stable code point needs no reordering
                return value;
            }
            decomposition[0] = value;
            decLength = 1;
        } else {
            // the table stored the decomposition into the buffer and
            // returned the number of stored elements
            decLength = value;
        }

        decOffset = 0;

        appendUntilStable();
        reorderByCombiningClass();

        if (decLength > 0) {
            return decomposition[decOffset++];
        }

        return EOF_ELEMENT;
    }

    /**
     * Makes sure that the decomposition buffer can hold at least
     * <code>required</code> elements. The buffer grows in multiples of
     * <code>CAPACITY_INCREMENT</code>, so a single call is sufficient even
     * if the requested size exceeds one increment.
     *
     * @param used the number of leading elements to preserve on growth
     * @param required the minimal required buffer capacity
     */
    private void ensureCapacity(int used, int required) {
        if (required <= decomposition.length) {
            return;
        }

        int newCapacity = decomposition.length;
        do {
            newCapacity += CAPACITY_INCREMENT;
        } while (newCapacity < required);

        int[] newDecomposition = new int[newCapacity];
        System.arraycopy(decomposition, 0, newDecomposition, 0, used);
        decomposition = newDecomposition;
    }

    /**
     * Reads further code points from the source and appends their
     * decompositions to the buffer. It stops after a stable single code
     * point has been appended or when the end of the source is reached.
     */
    private void appendUntilStable() {
        int value = nextUTF32();
        while (value != EOF_ELEMENT) {
            // reserve room for the longest decomposition the table reports
            ensureCapacity(decLength, decLength + maxDecomposition);

            value = table.getCanonicalDecomposition(decomposition, decLength,
                    value);

            if (NormalizationTable.isSingleCodePoint(value)) {
                decomposition[decLength++] = value;
                if (NormalizationTable.isStable(value)) {
                    break;
                }
            } else {
                decLength += value;
            }

            value = nextUTF32();
        }
    }

    /**
     * Orders the buffered code points according to their combining classes.
     * Adjacent elements are exchanged when the first one has a higher
     * combining class than the second one and the second one has a non-zero
     * combining class. The passes are repeated until no exchange happens, so
     * elements with equal combining classes keep their relative order.
     */
    private void reorderByCombiningClass() {
        boolean swapped;
        do {
            swapped = false;

            for (int i = 1; i < decLength; ++i) {
                int cp1 = decomposition[i - 1];
                int cp2 = decomposition[i];

                int cc1 = NormalizationTable.getCombiningClass(cp1);
                int cc2 = NormalizationTable.getCombiningClass(cp2);

                if ((cc1 > cc2) && (cc2 != 0)) {
                    decomposition[i - 1] = cp2;
                    decomposition[i] = cp1;
                    swapped = true;
                }
            }
        } while (swapped);
    }
}