RBTableBuilder
public final class RBTableBuilder extends Object
This class contains all the code to parse a RuleBasedCollator pattern
and build a RBCollationTables object from it. A particular instance
of this class exists only during the actual build process -- once an
RBCollationTables object has been built, the RBTableBuilder object
goes away. This object carries all of the state which is only needed
during the build process, plus a "shadow" copy of all of the state
that will go into the tables object itself. This object communicates
with RBCollationTables through a separate class, RBCollationTables.BuildAPI,
which is an inner class of RBCollationTables and provides a separate
private API for communication with RBTableBuilder.
This class isn't just an inner class of RBCollationTables itself because
of its large size. For source-code readability, it seemed better for the
builder to have its own source file. |
Fields Summary |
---|
static final int | CHARINDEX | private static final int | IGNORABLEMASK | private static final int | PRIMARYORDERINCREMENT | private static final int | SECONDARYORDERINCREMENT | private static final int | TERTIARYORDERINCREMENT | private static final int | INITIALTABLESIZE | private static final int | MAXKEYSIZE | private RBCollationTables$BuildAPI | tables | private MergeCollation | mPattern | private boolean | isOverIgnore | private char[] | keyBuf | private IntHashtable | contractFlags | private boolean | frenchSec | private boolean | seAsianSwapping | private UCompactIntArray | mapping | private Vector | contractTable | private Vector | expandTable | private short | maxSecOrder | private short | maxTerOrder |
Methods Summary |
---|
| private void | addComposedChars()Add expanding entries for pre-composed unicode characters so that this
collator can be used reasonably well with decomposition turned off.
    // Visit every pre-composed character that ComposedCharIter yields and
    // give an order to each one the rules have not already covered.
    ComposedCharIter composed = new ComposedCharIter();
    for (int cp = composed.next(); cp != ComposedCharIter.DONE; cp = composed.next()) {
        if (getCharOrder(cp) != RBCollationTables.UNMAPPED) {
            // Already has an ordering; nothing to add for this character.
            continue;
        }

        String decomp = composed.decomposition();
        int decompLen = decomp.length();

        // Case 1: the decomposition is a single char, or a two-char string
        // led by a high surrogate. Here it is enough to reuse whatever order
        // that one code point already has (it need not be a contraction
        // order); adding a one-element expansion entry would be wasteful.
        // codePointAt(0) returns the plain char for the one-char case, so a
        // single lookup serves both shapes.
        boolean singleCodePoint = decompLen == 1
                || (decompLen == 2 && Character.isHighSurrogate(decomp.charAt(0)));
        if (singleCodePoint) {
            int order = getCharOrder(decomp.codePointAt(0));
            if (order != RBCollationTables.UNMAPPED) {
                addOrder(cp, order);
            }
            continue;
        }

        // Case 2: the whole decomposed string is already known as one
        // contracting sequence -- map the pre-composed character to it.
        //
        // TODO: What we should really be doing here is trying to find the
        // longest initial substring of the decomposition that is present
        // in the tables as a contracting character sequence, and find its
        // ordering, then do this recursively with the remaining chars to
        // build a list of orderings for the expansion table. That would be
        // more correct but also significantly slower, so it may not be
        // worth doing.
        int contractOrder = getContractOrder(decomp);
        if (contractOrder != RBCollationTables.UNMAPPED) {
            addOrder(cp, contractOrder);
            continue;
        }

        // Case 3: no contracting order for the full string. If every
        // individual char of the decomposition has an order, add an
        // expanding entry for the pre-composed character.
        int mapped = 0;
        while (mapped < decompLen
                && getCharOrder(decomp.charAt(mapped)) != RBCollationTables.UNMAPPED) {
            mapped++;
        }
        if (mapped == decompLen) {
            addExpandOrder(cp, decomp, RBCollationTables.UNMAPPED);
        }
    }
| private void | addContractFlags(java.lang.String chars)
    // Record a flag value of 1 in contractFlags for every code point of
    // the given string. A high surrogate is combined with the char that
    // follows it before being recorded.
    int end = chars.length();
    int pos = 0;
    while (pos < end) {
        char lead = chars.charAt(pos++);
        int codePoint;
        if (Character.isHighSurrogate(lead)) {
            // Consume the following char as the second half of the pair.
            codePoint = Character.toCodePoint(lead, chars.charAt(pos++));
        } else {
            codePoint = lead;
        }
        contractFlags.put(codePoint, 1);
    }
| private final void | addContractOrder(java.lang.String groupChars, int anOrder)
// Convenience overload: delegates to the three-argument form with
// fwd == true, i.e. registers groupChars as a forward contracting string.
addContractOrder(groupChars, anOrder, true);
| private final void | addContractOrder(java.lang.String groupChars, int anOrder, boolean fwd)Adds the contracting string into the collation table.
// The table of contract lists is created lazily on first use.
if (contractTable == null) {
contractTable = new Vector(INITIALTABLESIZE);
}
//initial character
// codePointAt(0) yields the full code point when the string starts with a
// surrogate pair, which is what the older hand-written logic below did.
int ch = groupChars.codePointAt(0);
/*
char ch0 = groupChars.charAt(0);
int ch = Character.isHighSurrogate(ch0)?
Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
*/
// See if the initial character of the string already has a contract table.
// getContractValuesImpl returns null for a negative index, i.e. when the
// current mapping value is below CONTRACTCHARINDEX.
int entry = mapping.elementAt(ch);
Vector entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX);
if (entryTable == null) {
// We need to create a new table of contract entries for this base char
// The index is computed before the new list is appended, so it refers to
// the slot the list is about to occupy.
int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size();
entryTable = new Vector(INITIALTABLESIZE);
contractTable.addElement(entryTable);
// Add the initial character's current ordering first. then
// update its mapping to point to this contract table
entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry));
mapping.setElementAt(ch, tableIndex);
}
// Now add (or replace) this string in the table
int index = RBCollationTables.getEntry(entryTable, groupChars, fwd);
if (index != RBCollationTables.UNMAPPED) {
// The string is already present: just overwrite its order.
EntryPair pair = (EntryPair) entryTable.elementAt(index);
pair.value = anOrder;
} else {
EntryPair pair = (EntryPair)entryTable.lastElement();
// NOTE: This little bit of logic is here to speed CollationElementIterator
// .nextContractChar(). This code ensures that the longest sequence in
// this list is always the _last_ one in the list. This keeps
// nextContractChar() from having to search the entire list for the longest
// sequence.
if (groupChars.length() > pair.entryName.length()) {
// New string is longer than the current last entry: it becomes the last.
entryTable.addElement(new EntryPair(groupChars, anOrder, fwd));
} else {
// Otherwise insert just before the last entry so the longest stays last.
entryTable.insertElementAt(new EntryPair(groupChars, anOrder,
fwd), entryTable.size() - 1);
}
}
// If this was a forward mapping for a contracting string, also add a
// reverse mapping for it, so that CollationElementIterator.previous
// can work right
// The recursive call passes fwd == false, so it cannot recurse again.
if (fwd && groupChars.length() > 1) {
addContractFlags(groupChars);
addContractOrder(new StringBuffer(groupChars).reverse().toString(),
anOrder, false);
}
| private final void | addExpandOrder(java.lang.String contractChars, java.lang.String expandChars, int anOrder)Adds the expanding string into the collation table.
// Create an expansion table entry
int tableIndex = addExpansion(anOrder, expandChars);
// And add its index into the main mapping table
if (contractChars.length() > 1) {
char ch = contractChars.charAt(0);
if (Character.isHighSurrogate(ch) && contractChars.length() == 2) {
char ch2 = contractChars.charAt(1);
if (Character.isLowSurrogate(ch2)) {
//only add into table when it is a legal surrogate
addOrder(Character.toCodePoint(ch, ch2), tableIndex);
}
} else {
addContractOrder(contractChars, tableIndex);
}
} else {
addOrder(contractChars.charAt(0), tableIndex);
}
| private final void | addExpandOrder(int ch, java.lang.String expandChars, int anOrder)
    // Create the expansion entry and map the code point to its table index.
    addOrder(ch, addExpansion(anOrder, expandChars));
| private int | addExpansion(int anOrder, java.lang.String expandChars)Create a new entry in the expansion table that contains the orderings
for the given characters. If anOrder is valid, it is added to the
beginning of the expanded list of orders.
    // The expansion table is created lazily on first use.
    if (expandTable == null) {
        expandTable = new Vector(INITIALTABLESIZE);
    }

    // Reserve one slot per char, plus a leading slot when anOrder is valid.
    boolean hasLeadingOrder = anOrder != RBCollationTables.UNMAPPED;
    int offset = hasLeadingOrder ? 1 : 0;
    int[] valueList = new int[expandChars.length() + offset];
    if (hasLeadingOrder) {
        valueList[0] = anOrder;
    }

    int used = offset;
    for (int i = 0; i < expandChars.length(); i++) {
        char lead = expandChars.charAt(i);
        int codePoint;
        if (Character.isHighSurrogate(lead)) {
            // Step onto the expected low surrogate. Stop the whole loop if
            // it is missing or is not a legal low surrogate.
            i++;
            if (i == expandChars.length()) {
                break;
            }
            char trail = expandChars.charAt(i);
            if (!Character.isLowSurrogate(trail)) {
                break;
            }
            codePoint = Character.toCodePoint(lead, trail);
        } else {
            codePoint = lead;
        }

        // Use the known order, or leave a CHARINDEX-tagged placeholder that
        // will be filled in by commit().
        int mapValue = getCharOrder(codePoint);
        valueList[used++] = (mapValue != RBCollationTables.UNMAPPED)
                ? mapValue
                : CHARINDEX + codePoint;
    }

    if (used < valueList.length) {
        // Fewer slots were filled than reserved (a supplementary character
        // takes two chars but one slot, or the loop stopped early), so
        // shrink the array to the filled prefix.
        int[] trimmed = new int[used];
        System.arraycopy(valueList, 0, trimmed, 0, used);
        valueList = trimmed;
    }

    // Add the expanding char list into the expansion table.
    int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size();
    expandTable.addElement(valueList);
    return tableIndex;
| private final void | addOrder(int ch, int anOrder)Adds a character and its designated order into the collation table.
    int existing = mapping.elementAt(ch);
    if (existing < RBCollationTables.CONTRACTCHARINDEX) {
        // No contract table hangs off this character: write the order
        // directly into the mapping table. A later entry for the same
        // character simply replaces the earlier one.
        mapping.setElementAt(ch, anOrder);
        return;
    }

    // The character's mapping already points at a contracting-character
    // table, so the order has to go into that table instead. Spell the
    // code point out as a one- or two-char string in keyBuf first.
    int keyLength;
    if (Character.isSupplementaryCodePoint(ch)) {
        keyLength = Character.toChars(ch, keyBuf, 0);
    } else {
        keyBuf[0] = (char) ch;
        keyLength = 1;
    }
    addContractOrder(new String(keyBuf, 0, keyLength), anOrder);
| public void | build(java.lang.String pattern, int decmp)Create a table-based collation object with the given rules.
This is the main function that actually builds the tables and
stores them back in the RBCollationTables object. It is called
ONLY by the RBCollationTables constructor.
    // Throws ParseException when the rule string is empty.
    String expChars;
    String groupChars;
    if (pattern.length() == 0) {
        throw new ParseException("Build rules empty.", 0);
    }

    // This array maps Unicode characters to their collation ordering
    mapping = new UCompactIntArray((int)RBCollationTables.UNMAPPED);

    // Normalize the build rules. Find occurrences of all decomposed characters
    // and normalize the rules before feeding into the builder. By "normalize",
    // we mean that all precomposed Unicode characters must be converted into
    // a base character and one or more combining characters (such as accents).
    // When there are multiple combining characters attached to a base character,
    // the combining characters must be in their canonical order
    //
    // sherman/Note:
    //(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose
    //hangul syllables to jamos, so we can actually just call decompose with
    //normalizer's IGNORE_HANGUL option turned on
    //
    //(2)just call the "special version" in NormalizerImpl directly
    //pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true);
    //
    //Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp);
    //pattern = Normalizer.normalize(pattern, mode, 0, true);
    pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern);

    // Build the merged collation entries
    // Since rules can be specified in any order in the string
    // (e.g. "c , C < d , D < e , E .... C < CH")
    // this splits all of the rules in the string out into separate
    // objects and then sorts them. In the above example, it merges the
    // "C < CH" rule in just before the "C < D" rule.
    //
    mPattern = new MergeCollation(pattern);

    int order = 0;

    // Now walk though each entry and add it to my own tables
    for (int i = 0; i < mPattern.getCount(); ++i) {
        PatternEntry entry = mPattern.getItemAt(i);
        if (entry != null) {
            groupChars = entry.getChars();
            if (groupChars.length() > 1) {
                // A trailing '@' or '!' is a modifier, not part of the
                // characters being ordered: record the flag and strip it.
                switch (groupChars.charAt(groupChars.length() - 1)) {
                case '@':
                    frenchSec = true;
                    groupChars = groupChars.substring(0, groupChars.length() - 1);
                    break;
                case '!':
                    seAsianSwapping = true;
                    groupChars = groupChars.substring(0, groupChars.length() - 1);
                    break;
                }
            }

            order = increment(entry.getStrength(), order);
            expChars = entry.getExtension();

            if (expChars.length() != 0) {
                addExpandOrder(groupChars, expChars, order);
            } else if (groupChars.length() > 1) {
                char ch = groupChars.charAt(0);
                if (Character.isHighSurrogate(ch) && groupChars.length() == 2) {
                    // Two chars led by a high surrogate are treated as one
                    // supplementary code point, not as a contraction.
                    addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order);
                } else {
                    addContractOrder(groupChars, order);
                }
            } else {
                char ch = groupChars.charAt(0);
                addOrder(ch, order);
            }
        }
    }

    // Fill in orders for pre-composed characters, resolve the placeholders
    // left in the expansion table, then compact the mapping array.
    addComposedChars();
    commit();
    mapping.compact();
    /*
    System.out.println("mappingSize=" + mapping.getKSize());
    for (int j = 0; j < 0xffff; j++) {
        int value = mapping.elementAt(j);
        if (value != RBCollationTables.UNMAPPED)
            System.out.println("index=" + Integer.toString(j, 16)
                       + ", value=" + Integer.toString(value, 16));
    }
    */
    // Hand the finished state over to the tables object.
    tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable,
                        contractFlags, maxSecOrder, maxTerOrder);
| private final void | commit()Look up for unmapped values in the expanded character table.
When the expanding character tables are built by addExpandOrder,
it doesn't know what the final ordering of each character
in the expansion will be. Instead, it just puts the raw character
code into the table, adding CHARINDEX as a flag. Now that we've
finished building the mapping table, we can go back and look up
that character to see what its real collation order is and
stick that into the expansion table. That lets us avoid doing
a two-stage lookup later.
    // Nothing to resolve when no expansion was ever added.
    if (expandTable == null) {
        return;
    }

    for (int row = 0; row < expandTable.size(); row++) {
        int[] orders = (int[]) expandTable.elementAt(row);
        for (int col = 0; col < orders.length; col++) {
            int pending = orders[col];

            // Only values strictly between CHARINDEX and EXPANDCHARINDEX
            // are placeholders that still need to be resolved.
            boolean isPlaceholder = pending > CHARINDEX
                    && pending < RBCollationTables.EXPANDCHARINDEX;
            if (!isPlaceholder) {
                continue;
            }

            // Recover the raw character code and look up its real order.
            int ch = pending - CHARINDEX;
            int resolved = getCharOrder(ch);

            // Still unmapped: maybe it's ignorable, so store the masked
            // character code; otherwise store the real order.
            orders[col] = (resolved == RBCollationTables.UNMAPPED)
                    ? (IGNORABLEMASK & ch)
                    : resolved;
        }
    }
| private final int | getCharOrder(int ch)
    // Return the order of a single code point. When its mapping points at a
    // contract list, the order is the value of that list's first entry.
    int mapped = mapping.elementAt(ch);
    if (mapped < RBCollationTables.CONTRACTCHARINDEX) {
        return mapped;
    }
    Vector contractList = getContractValuesImpl(mapped - RBCollationTables.CONTRACTCHARINDEX);
    EntryPair first = (EntryPair) contractList.firstElement();
    return first.value;
| private int | getContractOrder(java.lang.String groupChars)If the given string has been specified as a contracting string
in this collation table, return its ordering.
Otherwise return UNMAPPED.
    // No contract table at all means nothing can match.
    if (contractTable == null) {
        return RBCollationTables.UNMAPPED;
    }

    // Find the contract list keyed by the string's first code point.
    Vector entryTable = getContractValues(groupChars.codePointAt(0));
    if (entryTable == null) {
        return RBCollationTables.UNMAPPED;
    }

    // Look the full string up as a forward entry in that list.
    int index = RBCollationTables.getEntry(entryTable, groupChars, true);
    if (index == RBCollationTables.UNMAPPED) {
        return RBCollationTables.UNMAPPED;
    }
    EntryPair match = (EntryPair) entryTable.elementAt(index);
    return match.value;
| private java.util.Vector | getContractValues(int ch)Get the entry of hash table of the contracting string in the collation
table.
int index = mapping.elementAt(ch);
return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX);
| private java.util.Vector | getContractValuesImpl(int index)
if (index >= 0)
{
return (Vector)contractTable.elementAt(index);
}
else // not found
{
return null;
}
| private final int | increment(int aStrength, int lastValue)Increment of the last order based on the comparison level.
    if (aStrength == Collator.PRIMARY) {
        // Bump the primary order and mask off any secondary and tertiary
        // difference.
        lastValue = (lastValue + PRIMARYORDERINCREMENT) & RBCollationTables.PRIMARYORDERMASK;
        isOverIgnore = true;
    } else if (aStrength == Collator.SECONDARY) {
        // Bump the secondary order and mask off any tertiary difference.
        lastValue = (lastValue + SECONDARYORDERINCREMENT) & RBCollationTables.SECONDARYDIFFERENCEONLY;
        // Record the max number of ignorable chars with secondary difference.
        if (!isOverIgnore) {
            maxSecOrder++;
        }
    } else if (aStrength == Collator.TERTIARY) {
        // Bump the tertiary order.
        lastValue += TERTIARYORDERINCREMENT;
        // Record the max number of ignorable chars with tertiary difference.
        if (!isOverIgnore) {
            maxTerOrder++;
        }
    }
    // Any other strength leaves the value untouched.
    return lastValue;
|
|