/*
*
*
* Copyright 1990-2007 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 only, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License version 2 for more details (a copy is
* included at /legal/license.txt).
*
* You should have received a copy of the GNU General Public License
* version 2 along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 or visit www.sun.com if you need additional
* information or have any questions.
*/
package com.sun.j2me.global;
/**
* A string collator is responsible for a conversion of strings from their
* canonically decomposed forms to collation elements. It requires a decomposer
* which is able to convert a string to its canonically decomposed equivalent
* (Normalization Form D).
*/
public class StringCollator implements StringDecomposer {
/** The capacity increment value of the internal buffers. */
private static final int CAPACITY_INCREMENT = 64;
/** Internal collation buffer. */
private int[] collation = new int[CAPACITY_INCREMENT];
/** Collation offset. */
private int colOffset;
/** Collation length. */
private int colLength;
/** Internal decomposition buffer. */
private int[] decomposition = new int[CAPACITY_INCREMENT];
/** Decomposition offset. */
private int decOffset;
/** Decomposition length. */
private int decLength;
/** The longest possible contraction in the table. */
private int maxContraction;
/** A decomposer for the canonical decomposition of strings. */
private StringDecomposer source;
/**
* A lookup table to convert from characers to their collation element
* representation.
*/
private CollationElementTable table;
/**
* Creates a new instance of <code>StringCollator</code>.
*
* @param decomposer a decomposer for the canonical decomposition
* @param table a lookup table for the conversion
*/
public StringCollator(StringDecomposer decomposer,
CollationElementTable table) {
this.source = decomposer;
this.table = table;
maxContraction = table.getMaxContractionLength();
savedBookmarks = new int[maxContraction];
match = new int[maxContraction];
maxMatch = new int[maxContraction];
}
/**
* Restarts the decomposition.
*/
public final void reset() {
decOffset = 0;
decLength = 0;
colOffset = 0;
colLength = 0;
// source.reset();
}
/**
* Returns the next encoded code point value from the normalized input
* string (in Normalization Form D). It skips all ignorable code points
* and reorders code points with the logical order exception.
*
* @return the next encoded code point value from the normalized input
* string or <code>StringNormalizer.EOF_ELEMENT</code> if the end of
* string is reached.
*/
private final int getNextSourceElement() {
if (decOffset < decLength) {
return decomposition[decOffset++];
}
int ecp;
do {
ecp = source.getNextElement();
if (ecp == StringNormalizer.EOF_ELEMENT) {
return ecp;
}
} while (NormalizationTable.isIgnorable(ecp));
if (NormalizationTable.hasLogicalOrderException(ecp)) {
decomposition[0] = ecp;
decOffset = 0;
decLength = 1;
do {
ecp = source.getNextElement();
if (ecp == StringNormalizer.EOF_ELEMENT) {
return decomposition[decOffset++];
}
} while (NormalizationTable.isIgnorable(ecp));
}
return ecp;
}
/**
* Returns the encoded code point value from the normalized input string
* (in Normalization Form D) on the specified offset. It doesn't
* automatically discard the returned element. So two successive calls
* to <code>peekSourceElement(0)</code> return the same value. The method
* skips all ignorable code points and reorders code points with the
* logical order exception.
*
* @param offset the offset of the element to return
* @return the encoded code point value from the offset
*/
private final int peekSourceElement(int offset) {
if ((decOffset + offset) < decLength) {
return decomposition[decOffset + offset];
}
// if ((decOffset + offset) >= (decLength + 1)) {
// throw new IllegalStateException();
// }
int ecp;
do {
ecp = source.getNextElement();
if (ecp == StringNormalizer.EOF_ELEMENT) {
return ecp;
}
} while (NormalizationTable.isIgnorable(ecp));
offset += decOffset;
if ((offset + 1) >= decomposition.length) {
offset -= decOffset;
decLength -= decOffset;
// reserve for possible logical order exception
if (decOffset <= 1) {
// allocate a new array
int[] newDecomposition = new int[decomposition.length +
CAPACITY_INCREMENT];
System.arraycopy(decomposition, decOffset, newDecomposition, 0,
decLength);
decomposition = newDecomposition;
} else {
// shift only
System.arraycopy(decomposition, decOffset,
decomposition, 0, decLength);
}
decOffset = 0;
}
if (NormalizationTable.hasLogicalOrderException(ecp)) {
int lastCP = ecp;
do {
ecp = source.getNextElement();
if (ecp == StringNormalizer.EOF_ELEMENT) {
decomposition[offset] = lastCP;
return lastCP;
}
} while (NormalizationTable.isIgnorable(ecp));
decomposition[offset++] = ecp;
decomposition[offset] = lastCP;
decLength += 2;
return ecp;
}
decomposition[offset] = ecp;
++decLength;
return ecp;
}
/** Array of the saved bookmarks. */
private int[] savedBookmarks;
/** Internal buffer. */
private int[] maxMatch;
/** Internal buffer. */
private int[] match;
/** Max length. */
private int maxLength;
/** Length of the prefix. */
private int prefixLength;
/** Max bookmark. */
private int maxBookmark;
/**
* A helper recursive method for <code>matchTail</code>.
*
* @param bookmark the bookmark which represents the current code point
* sequence with a partial match in the collation element table
* @param index the index of the next tested code point
* @param accepted the number of code points which has been accepted so far
* @param lastCC the combining class of the previous code point
*/
private final void matchTailAux(int bookmark, int index,
int accepted, int lastCC) {
if ((prefixLength + accepted) >= maxContraction) {
return;
}
int ecp, cc;
if ((ecp = peekSourceElement(index)) ==
StringNormalizer.EOF_ELEMENT) {
return;
}
cc = NormalizationTable.getCombiningClass(ecp);
while (cc == lastCC) {
++index;
if ((ecp = peekSourceElement(index)) ==
StringNormalizer.EOF_ELEMENT) {
return;
}
cc = NormalizationTable.getCombiningClass(ecp);
}
if (cc == 0) {
return;
}
// IMPL_NOTE: reorder?
// 1. skip
matchTailAux(bookmark, index + 1, accepted, cc);
int bookmark2 = table.getChildBookmark(bookmark,
NormalizationTable.getCodePoint(ecp));
if (bookmark2 != CollationElementTable.INVALID_BOOKMARK_VALUE) {
// 2. accept
match[accepted] = ecp;
matchTailAux(bookmark2, index + 1, accepted + 1, lastCC);
if ((accepted + 1) > maxLength) {
int bookmark3 = table.getChildBookmark(bookmark2,
CollationElementTable.TERMINAL_CODE_POINT);
if (bookmark3 != CollationElementTable.INVALID_BOOKMARK_VALUE) {
// 3. terminate
maxLength = accepted + 1;
maxBookmark = bookmark3;
System.arraycopy(match, 0, maxMatch, 0, maxLength);
}
}
}
}
/**
* Tries to find the longest possible match among code points with non-zero
* combining class. Some of them can be skipped to form the longest matching
* code point sequence. This implies usage of a recursive algorithm
* (<code>matchTailAux</code>).
*
* @param bookmark the bookmark which represents the current code point
* sequence with a partial match in the collation element table
* @param index the index of the next code point
* @return the bookmark representing the longest matching code point
* sequence
*/
private final int matchTail(int bookmark, int index) {
maxLength = 0;
maxBookmark = CollationElementTable.INVALID_BOOKMARK_VALUE;
prefixLength = index;
matchTailAux(bookmark, index, 0, -1);
if (maxBookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
decOffset += index;
int length = decLength - decOffset;
if (length == maxLength) {
decOffset = 0;
decLength = 0;
return maxBookmark;
}
for (int i = 0, j = 0, k = 0; j < length; ++j) {
int cp = decomposition[decOffset + j];
if ((k < maxLength) && (cp == maxMatch[k])) {
++k;
} else {
decomposition[i] = cp;
++i;
}
}
decOffset = 0;
decLength -= maxLength + index;
}
return maxBookmark;
}
/**
* Tries to find the longest possible code point sequence from the collation
* element table that have a match at the current position of the normalized
* input string.
*
* @param initialBookmark the starting bookmark
* @return the bookmark representing the longest matching code point
* sequence
*/
private final int match(int initialBookmark) {
int i;
int bookmark = initialBookmark;
int ecp;
for (i = 0; i < maxContraction; ++i) {
if ((ecp = peekSourceElement(i)) == StringNormalizer.EOF_ELEMENT) {
break;
}
bookmark = table.getChildBookmark(bookmark,
NormalizationTable.getCodePoint(ecp));
if (bookmark == CollationElementTable.INVALID_BOOKMARK_VALUE) {
break;
}
savedBookmarks[i] = bookmark;
}
do {
for (--i; i >= 0; --i) {
if (NormalizationTable.isStable(
decomposition[decOffset + i])) {
break;
}
}
bookmark = (i >= 0) ? savedBookmarks[i] : initialBookmark;
bookmark = matchTail(bookmark, i + 1);
if (bookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
return bookmark;
}
bookmark = (i >= 0) ? savedBookmarks[i] : initialBookmark;
bookmark = table.getChildBookmark(bookmark,
CollationElementTable.TERMINAL_CODE_POINT);
if (bookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
decOffset += i + 1;
if (decOffset == decLength) {
decOffset = 0;
decLength = 0;
}
return bookmark;
}
} while (i > 0);
return initialBookmark;
}
/**
* Returns the next encoded collation element from the input string. The
* returned value can be inspected by the methods of the
* <code>CollationElementTable</code> class. Returns
* <code>EOF_ELEMENT</code> if the end of string is reached.
*
* @return the next encoded collation element or <code>EOF_ELEMENT</code>
* if the end of string is reached
* @see CollationElementTable
*/
public final int getNextElement() {
if (colOffset < colLength) {
return collation[colOffset++];
}
int value;
if ((value = getNextSourceElement()) ==
StringNormalizer.EOF_ELEMENT) {
return EOF_ELEMENT;
}
value = table.getCollationElements(collation, 0,
NormalizationTable.getCodePoint(value));
if (CollationElementTable.isBookmark(value)) {
value = match(value);
value = table.getCollationElements(collation, 0, value);
// check for INVALID_BOOKMARK_VALUE?
}
if (CollationElementTable.isSingleCollationEl(value)) {
return value;
}
colOffset = 1;
colLength = value;
return collation[0];
}
}
|