FileDocCategorySizeDatePackage
StringCollator.javaAPI DocphoneME MR2 API (J2ME)14197Wed May 02 18:00:46 BST 2007com.sun.j2me.global

StringCollator

public class StringCollator extends Object implements StringDecomposer
A string collator is responsible for a conversion of strings from their canonically decomposed forms to collation elements. It requires a decomposer which is able to convert a string to its canonically decomposed equivalent (Normalization Form D).

Fields Summary
private static final int
CAPACITY_INCREMENT
The capacity increment value of the internal buffers.
private int[]
collation
Internal collation buffer.
private int
colOffset
Collation offset.
private int
colLength
Collation length.
private int[]
decomposition
Internal decomposition buffer.
private int
decOffset
Decomposition offset.
private int
decLength
Decomposition length.
private int
maxContraction
The longest possible contraction in the table.
private StringDecomposer
source
A decomposer for the canonical decomposition of strings.
private CollationElementTable
table
A lookup table to convert from characers to their collation element representation.
private int[]
savedBookmarks
Array of the saved bookmarks.
private int[]
maxMatch
Internal buffer.
private int[]
match
Internal buffer.
private int
maxLength
Max length.
private int
prefixLength
Length of the prefix.
private int
maxBookmark
Max bookmark.
Constructors Summary
public StringCollator(StringDecomposer decomposer, CollationElementTable table)
Creates a new instance of StringCollator.

param
decomposer a decomposer for the canonical decomposition
param
table a lookup table for the conversion


                               
       
              
        this.source = decomposer;
        this.table = table;
        
        maxContraction = table.getMaxContractionLength();
        
        savedBookmarks = new int[maxContraction];

        match = new int[maxContraction];
        maxMatch = new int[maxContraction];
    
Methods Summary
public final intgetNextElement()
Returns the next encoded collation element from the input string. The returned value can be inspected by the methods of the CollationElementTable class. Returns EOF_ELEMENT if the end of string is reached.

return
the next encoded collation element or EOF_ELEMENT if the end of string is reached
see
CollationElementTable

        if (colOffset < colLength) {
            return collation[colOffset++];
        }
        
        int value;

        if ((value = getNextSourceElement()) == 
                StringNormalizer.EOF_ELEMENT) {
            return EOF_ELEMENT;
        }
        
        value = table.getCollationElements(collation, 0,
                NormalizationTable.getCodePoint(value));

        if (CollationElementTable.isBookmark(value)) {
            value = match(value);
            value = table.getCollationElements(collation, 0, value);
            // check for INVALID_BOOKMARK_VALUE?
        }

        if (CollationElementTable.isSingleCollationEl(value)) {
            return value;
        }

        colOffset = 1;
        colLength = value;
        return collation[0];
    
private final intgetNextSourceElement()
Returns the next encoded code point value from the normalized input string (in Normalization Form D). It skips all ignorable code points and reorders code points with the logical order exception.

return
the next encoded code point value from the normalized input string or StringNormalizer.EOF_ELEMENT if the end of string is reached.

        if (decOffset < decLength) {
            return decomposition[decOffset++];
        }
        
        int ecp;
        do {
            ecp = source.getNextElement();
            if (ecp == StringNormalizer.EOF_ELEMENT) {
                return ecp;
            }
        } while (NormalizationTable.isIgnorable(ecp));

        if (NormalizationTable.hasLogicalOrderException(ecp)) {
            decomposition[0] = ecp;
            decOffset = 0;
            decLength = 1;
            do {
                ecp = source.getNextElement();
                if (ecp == StringNormalizer.EOF_ELEMENT) {
                    return decomposition[decOffset++];
                }
            } while (NormalizationTable.isIgnorable(ecp));
        }

        return ecp;
    
private final intmatch(int initialBookmark)
Tries to find the longest possible code point sequence from the collation element table that have a match at the current position of the normalized input string.

param
initialBookmark the starting bookmark
return
the bookmark representing the longest matching code point sequence

        int i;
        
        int bookmark = initialBookmark;
        int ecp;
        for (i = 0; i < maxContraction; ++i) {
            if ((ecp = peekSourceElement(i)) == StringNormalizer.EOF_ELEMENT) {
                break;
            }
            bookmark = table.getChildBookmark(bookmark, 
                    NormalizationTable.getCodePoint(ecp));
            if (bookmark == CollationElementTable.INVALID_BOOKMARK_VALUE) {
                break;
            }
            savedBookmarks[i] = bookmark;
        }
        
        do {
            for (--i; i >= 0; --i) {
                if (NormalizationTable.isStable(
                        decomposition[decOffset + i])) {
                    break;
                }
            }
            
            bookmark = (i >= 0) ? savedBookmarks[i] : initialBookmark;
            bookmark = matchTail(bookmark, i + 1);
            if (bookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
                return bookmark;
            }
           
            bookmark = (i >= 0) ? savedBookmarks[i] : initialBookmark;
            bookmark = table.getChildBookmark(bookmark, 
                CollationElementTable.TERMINAL_CODE_POINT);
            if (bookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
                decOffset += i + 1;
                if (decOffset == decLength) {
                    decOffset = 0;
                    decLength = 0;
                }
                return bookmark;
            }

        } while (i > 0);
                
        return initialBookmark;
    
private final intmatchTail(int bookmark, int index)
Tries to find the longest possible match among code points with non-zero combining class. Some of them can be skipped to form the longest matching code point sequence. This implies usage of a recursive algorithm (matchTailAux).

param
bookmark the bookmark which represents the current code point sequence with a partial match in the collation element table
param
index the index of the next code point
return
the bookmark representing the longest matching code point sequence

        maxLength = 0;
        maxBookmark = CollationElementTable.INVALID_BOOKMARK_VALUE;
        prefixLength = index;
        matchTailAux(bookmark, index, 0, -1);

        if (maxBookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
            decOffset += index;
            int length = decLength - decOffset;
            if (length == maxLength) {
                decOffset = 0;
                decLength = 0;
                return maxBookmark;
            }
            for (int i = 0, j = 0, k = 0; j < length; ++j) {
                int cp = decomposition[decOffset + j];
                if ((k < maxLength) && (cp == maxMatch[k])) {
                    ++k;
                } else {
                    decomposition[i] = cp;
                    ++i;
                }
            }
            decOffset = 0;
            decLength -= maxLength + index;
        }

        return maxBookmark;
    
private final voidmatchTailAux(int bookmark, int index, int accepted, int lastCC)
A helper recursive method for matchTail.

param
bookmark the bookmark which represents the current code point sequence with a partial match in the collation element table
param
index the index of the next tested code point
param
accepted the number of code points which has been accepted so far
param
lastCC the combining class of the previous code point

        if ((prefixLength + accepted) >= maxContraction) {
            return;
        }

        int ecp, cc;
        if ((ecp = peekSourceElement(index)) == 
                StringNormalizer.EOF_ELEMENT) {
            return;
        }
        cc = NormalizationTable.getCombiningClass(ecp);
        while (cc == lastCC) {
            ++index;
            if ((ecp = peekSourceElement(index)) == 
                    StringNormalizer.EOF_ELEMENT) {
                return;
            }
            cc = NormalizationTable.getCombiningClass(ecp);
        }
        
        if (cc == 0) {
            return;
        }

        // IMPL_NOTE: reorder?
        // 1. skip
        matchTailAux(bookmark, index + 1, accepted, cc);
        int bookmark2 = table.getChildBookmark(bookmark, 
                NormalizationTable.getCodePoint(ecp));
        if (bookmark2 != CollationElementTable.INVALID_BOOKMARK_VALUE) {
            // 2. accept
            match[accepted] = ecp;
            matchTailAux(bookmark2, index + 1, accepted + 1, lastCC);
            
            if ((accepted + 1) > maxLength) {
                int bookmark3 = table.getChildBookmark(bookmark2, 
                        CollationElementTable.TERMINAL_CODE_POINT);
                if (bookmark3 != CollationElementTable.INVALID_BOOKMARK_VALUE) {
                    // 3. terminate
                    maxLength = accepted + 1;
                    maxBookmark = bookmark3;
                    System.arraycopy(match, 0, maxMatch, 0, maxLength);
                }
            }
        }
    
private final intpeekSourceElement(int offset)
Returns the encoded code point value from the normalized input string (in Normalization Form D) on the specified offset. It doesn't automatically discard the returned element. So two successive calls to peekSourceElement(0) return the same value. The method skips all ignorable code points and reorders code points with the logical order exception.

param
offset the offset of the element to return
return
the encoded code point value from the offset

        if ((decOffset + offset) < decLength) {
            return decomposition[decOffset + offset];
        }
        
//        if ((decOffset + offset) >= (decLength + 1)) {
//            throw new IllegalStateException();
//        }
       
        int ecp;
        do {
            ecp = source.getNextElement();
            if (ecp == StringNormalizer.EOF_ELEMENT) {
                return ecp;
            }
        } while (NormalizationTable.isIgnorable(ecp));

        offset += decOffset;
        
        if ((offset + 1) >= decomposition.length) {
            offset -= decOffset;
            decLength -= decOffset;
            // reserve for possible logical order exception
            if (decOffset <= 1) {
                // allocate a new array
                int[] newDecomposition = new int[decomposition.length + 
                        CAPACITY_INCREMENT];
                System.arraycopy(decomposition, decOffset, newDecomposition, 0, 
                        decLength);
                decomposition = newDecomposition;               
            } else {
                // shift only
                System.arraycopy(decomposition, decOffset,
                        decomposition, 0, decLength);
            }
            decOffset = 0;
        }
        
        if (NormalizationTable.hasLogicalOrderException(ecp)) {
            int lastCP = ecp;
            do {
                ecp = source.getNextElement();
                if (ecp == StringNormalizer.EOF_ELEMENT) {
                    decomposition[offset] = lastCP;
                    return lastCP;
                }
            } while (NormalizationTable.isIgnorable(ecp));

            decomposition[offset++] = ecp;
            decomposition[offset] = lastCP;
            decLength += 2;
            return ecp;
        }

        decomposition[offset] = ecp;
        ++decLength;
        return ecp;
    
public final voidreset()
Restarts the decomposition.

        decOffset = 0;
        decLength = 0;
        colOffset = 0;
        colLength = 0;
//      source.reset();