StringCollatorpublic class StringCollator extends Object implements StringDecomposerA string collator is responsible for a conversion of strings from their
canonically decomposed forms to collation elements. It requires a decomposer
which is able to convert a string to its canonically decomposed equivalent
(Normalization Form D). |
Fields Summary |
---|
private static final int | CAPACITY_INCREMENTThe capacity increment value of the internal buffers. | private int[] | collationInternal collation buffer. | private int | colOffsetCollation offset. | private int | colLengthCollation length. | private int[] | decompositionInternal decomposition buffer. | private int | decOffsetDecomposition offset. | private int | decLengthDecomposition length. | private int | maxContractionThe longest possible contraction in the table. | private StringDecomposer | sourceA decomposer for the canonical decomposition of strings. | private CollationElementTable | tableA lookup table to convert from characers to their collation element
representation. | private int[] | savedBookmarksArray of the saved bookmarks. | private int[] | maxMatchInternal buffer. | private int[] | matchInternal buffer. | private int | maxLengthMax length. | private int | prefixLengthLength of the prefix. | private int | maxBookmarkMax bookmark. |
Constructors Summary |
---|
public StringCollator(StringDecomposer decomposer, CollationElementTable table)Creates a new instance of StringCollator .
this.source = decomposer;
this.table = table;
maxContraction = table.getMaxContractionLength();
savedBookmarks = new int[maxContraction];
match = new int[maxContraction];
maxMatch = new int[maxContraction];
|
Methods Summary |
---|
public final int | getNextElement()Returns the next encoded collation element from the input string. The
returned value can be inspected by the methods of the
CollationElementTable class. Returns
EOF_ELEMENT if the end of string is reached.
if (colOffset < colLength) {
return collation[colOffset++];
}
int value;
if ((value = getNextSourceElement()) ==
StringNormalizer.EOF_ELEMENT) {
return EOF_ELEMENT;
}
value = table.getCollationElements(collation, 0,
NormalizationTable.getCodePoint(value));
if (CollationElementTable.isBookmark(value)) {
value = match(value);
value = table.getCollationElements(collation, 0, value);
// check for INVALID_BOOKMARK_VALUE?
}
if (CollationElementTable.isSingleCollationEl(value)) {
return value;
}
colOffset = 1;
colLength = value;
return collation[0];
| private final int | getNextSourceElement()Returns the next encoded code point value from the normalized input
string (in Normalization Form D). It skips all ignorable code points
and reorders code points with the logical order exception.
if (decOffset < decLength) {
return decomposition[decOffset++];
}
int ecp;
do {
ecp = source.getNextElement();
if (ecp == StringNormalizer.EOF_ELEMENT) {
return ecp;
}
} while (NormalizationTable.isIgnorable(ecp));
if (NormalizationTable.hasLogicalOrderException(ecp)) {
decomposition[0] = ecp;
decOffset = 0;
decLength = 1;
do {
ecp = source.getNextElement();
if (ecp == StringNormalizer.EOF_ELEMENT) {
return decomposition[decOffset++];
}
} while (NormalizationTable.isIgnorable(ecp));
}
return ecp;
| private final int | match(int initialBookmark)Tries to find the longest possible code point sequence from the collation
element table that have a match at the current position of the normalized
input string.
int i;
int bookmark = initialBookmark;
int ecp;
for (i = 0; i < maxContraction; ++i) {
if ((ecp = peekSourceElement(i)) == StringNormalizer.EOF_ELEMENT) {
break;
}
bookmark = table.getChildBookmark(bookmark,
NormalizationTable.getCodePoint(ecp));
if (bookmark == CollationElementTable.INVALID_BOOKMARK_VALUE) {
break;
}
savedBookmarks[i] = bookmark;
}
do {
for (--i; i >= 0; --i) {
if (NormalizationTable.isStable(
decomposition[decOffset + i])) {
break;
}
}
bookmark = (i >= 0) ? savedBookmarks[i] : initialBookmark;
bookmark = matchTail(bookmark, i + 1);
if (bookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
return bookmark;
}
bookmark = (i >= 0) ? savedBookmarks[i] : initialBookmark;
bookmark = table.getChildBookmark(bookmark,
CollationElementTable.TERMINAL_CODE_POINT);
if (bookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
decOffset += i + 1;
if (decOffset == decLength) {
decOffset = 0;
decLength = 0;
}
return bookmark;
}
} while (i > 0);
return initialBookmark;
| private final int | matchTail(int bookmark, int index)Tries to find the longest possible match among code points with non-zero
combining class. Some of them can be skipped to form the longest matching
code point sequence. This implies usage of a recursive algorithm
(matchTailAux ).
maxLength = 0;
maxBookmark = CollationElementTable.INVALID_BOOKMARK_VALUE;
prefixLength = index;
matchTailAux(bookmark, index, 0, -1);
if (maxBookmark != CollationElementTable.INVALID_BOOKMARK_VALUE) {
decOffset += index;
int length = decLength - decOffset;
if (length == maxLength) {
decOffset = 0;
decLength = 0;
return maxBookmark;
}
for (int i = 0, j = 0, k = 0; j < length; ++j) {
int cp = decomposition[decOffset + j];
if ((k < maxLength) && (cp == maxMatch[k])) {
++k;
} else {
decomposition[i] = cp;
++i;
}
}
decOffset = 0;
decLength -= maxLength + index;
}
return maxBookmark;
| private final void | matchTailAux(int bookmark, int index, int accepted, int lastCC)A helper recursive method for matchTail .
if ((prefixLength + accepted) >= maxContraction) {
return;
}
int ecp, cc;
if ((ecp = peekSourceElement(index)) ==
StringNormalizer.EOF_ELEMENT) {
return;
}
cc = NormalizationTable.getCombiningClass(ecp);
while (cc == lastCC) {
++index;
if ((ecp = peekSourceElement(index)) ==
StringNormalizer.EOF_ELEMENT) {
return;
}
cc = NormalizationTable.getCombiningClass(ecp);
}
if (cc == 0) {
return;
}
// IMPL_NOTE: reorder?
// 1. skip
matchTailAux(bookmark, index + 1, accepted, cc);
int bookmark2 = table.getChildBookmark(bookmark,
NormalizationTable.getCodePoint(ecp));
if (bookmark2 != CollationElementTable.INVALID_BOOKMARK_VALUE) {
// 2. accept
match[accepted] = ecp;
matchTailAux(bookmark2, index + 1, accepted + 1, lastCC);
if ((accepted + 1) > maxLength) {
int bookmark3 = table.getChildBookmark(bookmark2,
CollationElementTable.TERMINAL_CODE_POINT);
if (bookmark3 != CollationElementTable.INVALID_BOOKMARK_VALUE) {
// 3. terminate
maxLength = accepted + 1;
maxBookmark = bookmark3;
System.arraycopy(match, 0, maxMatch, 0, maxLength);
}
}
}
| private final int | peekSourceElement(int offset)Returns the encoded code point value from the normalized input string
(in Normalization Form D) on the specified offset. It doesn't
automatically discard the returned element. So two successive calls
to peekSourceElement(0) return the same value. The method
skips all ignorable code points and reorders code points with the
logical order exception.
if ((decOffset + offset) < decLength) {
return decomposition[decOffset + offset];
}
// if ((decOffset + offset) >= (decLength + 1)) {
// throw new IllegalStateException();
// }
int ecp;
do {
ecp = source.getNextElement();
if (ecp == StringNormalizer.EOF_ELEMENT) {
return ecp;
}
} while (NormalizationTable.isIgnorable(ecp));
offset += decOffset;
if ((offset + 1) >= decomposition.length) {
offset -= decOffset;
decLength -= decOffset;
// reserve for possible logical order exception
if (decOffset <= 1) {
// allocate a new array
int[] newDecomposition = new int[decomposition.length +
CAPACITY_INCREMENT];
System.arraycopy(decomposition, decOffset, newDecomposition, 0,
decLength);
decomposition = newDecomposition;
} else {
// shift only
System.arraycopy(decomposition, decOffset,
decomposition, 0, decLength);
}
decOffset = 0;
}
if (NormalizationTable.hasLogicalOrderException(ecp)) {
int lastCP = ecp;
do {
ecp = source.getNextElement();
if (ecp == StringNormalizer.EOF_ELEMENT) {
decomposition[offset] = lastCP;
return lastCP;
}
} while (NormalizationTable.isIgnorable(ecp));
decomposition[offset++] = ecp;
decomposition[offset] = lastCP;
decLength += 2;
return ecp;
}
decomposition[offset] = ecp;
++decLength;
return ecp;
| public final void | reset()Restarts the decomposition.
decOffset = 0;
decLength = 0;
colOffset = 0;
colLength = 0;
// source.reset();
|
|