File Doc Category Size Date Package
RBTableBuilder.java API Doc Java SE 6 API 24203 Tue Jun 10 00:25:52 BST 2008 java.text

RBTableBuilder

java.lang.Object

public final class RBTableBuilder extends Object

This class contains all the code to parse a RuleBasedCollator pattern and build a RBCollationTables object from it. A particular instance of this class exists only during the actual build process -- once an RBCollationTables object has been built, the RBTableBuilder object goes away. This object carries all of the state which is only needed during the build process, plus a "shadow" copy of all of the state that will go into the tables object itself. This object communicates with RBCollationTables through a separate class, RBCollationTables.BuildAPI; this is an inner class of RBCollationTables and provides a separate private API for communication with RBTableBuilder. This class isn't just an inner class of RBCollationTables itself because of its large size. For source-code readability, it seemed better for the builder to have its own source file.

Fields Summary
static final int
CHARINDEX
private static final int
IGNORABLEMASK
private static final int
PRIMARYORDERINCREMENT
private static final int
SECONDARYORDERINCREMENT
private static final int
TERTIARYORDERINCREMENT
private static final int
INITIALTABLESIZE
private static final int
MAXKEYSIZE
private RBCollationTables$BuildAPI
tables
private MergeCollation
mPattern
private boolean
isOverIgnore
private char[]
keyBuf
private IntHashtable
contractFlags
private boolean
frenchSec
private boolean
seAsianSwapping
private UCompactIntArray
mapping
private Vector
contractTable
private Vector
expandTable
private short
maxSecOrder
private short
maxTerOrder
Constructors Summary
/**
 * Creates a builder bound to the given build API object.
 *
 * @param tables the RBCollationTables.BuildAPI handle stored in this
 *               builder's {@code tables} field
 */
public RBTableBuilder(RBCollationTables.BuildAPI tables) {
    this.tables = tables;
}
Methods Summary
private void addComposedChars()
Add expanding entries for pre-composed unicode characters so that this collator can be used reasonably well with decomposition turned off.
// Iterate through all of the pre-composed characters in Unicode ComposedCharIter iter = new ComposedCharIter(); int c; while ((c = iter.next()) != ComposedCharIter.DONE) { if (getCharOrder(c) == RBCollationTables.UNMAPPED) { // // We don't already have an ordering for this pre-composed character. // // First, see if the decomposed string is already in our // tables as a single contracting-string ordering. // If so, just map the precomposed character to that order. // // TODO: What we should really be doing here is trying to find the // longest initial substring of the decomposition that is present // in the tables as a contracting character sequence, and find its // ordering. Then do this recursively with the remaining chars // so that we build a list of orderings, and add that list to // the expansion table. // That would be more correct but also significantly slower, so // I'm not totally sure it's worth doing. // String s = iter.decomposition(); //sherman/Note: if this is 1 character decomposed string, the //only thing need to do is to check if this decomposed character //has an entry in our order table, this order is not necessary //to be a contraction order, if it does have one, add an entry //for the precomposed character by using the same order, the //previous impl unnecessarily adds a single character expansion //entry. 
if (s.length() == 1) { int order = getCharOrder(s.charAt(0)); if (order != RBCollationTables.UNMAPPED) { addOrder(c, order); } continue; } else if (s.length() == 2) { char ch0 = s.charAt(0); if (Character.isHighSurrogate(ch0)) { int order = getCharOrder(s.codePointAt(0)); if (order != RBCollationTables.UNMAPPED) { addOrder(c, order); } continue; } } int contractOrder = getContractOrder(s); if (contractOrder != RBCollationTables.UNMAPPED) { addOrder(c, contractOrder); } else { // // We don't have a contracting ordering for the entire string // that results from the decomposition, but if we have orders // for each individual character, we can add an expanding // table entry for the pre-composed character // boolean allThere = true; for (int i = 0; i < s.length(); i++) { if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) { allThere = false; break; } } if (allThere) { addExpandOrder(c, s, RBCollationTables.UNMAPPED); } } } }
private void addContractFlags(java.lang.String chars)
char c0; int c; int len = chars.length(); for (int i = 0; i < len; i++) { c0 = chars.charAt(i); c = Character.isHighSurrogate(c0) ?Character.toCodePoint(c0, chars.charAt(++i)) :c0; contractFlags.put(c, 1); }
private final void addContractOrder(java.lang.String groupChars, int anOrder)
addContractOrder(groupChars, anOrder, true);
private final void addContractOrder(java.lang.String groupChars, int anOrder, boolean fwd)
Adds the contracting string into the collation table.
if (contractTable == null) { contractTable = new Vector(INITIALTABLESIZE); } //initial character int ch = groupChars.codePointAt(0); /* char ch0 = groupChars.charAt(0); int ch = Character.isHighSurrogate(ch0)? Character.toCodePoint(ch0, groupChars.charAt(1)):ch0; */ // See if the initial character of the string already has a contract table. int entry = mapping.elementAt(ch); Vector entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX); if (entryTable == null) { // We need to create a new table of contract entries for this base char int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size(); entryTable = new Vector(INITIALTABLESIZE); contractTable.addElement(entryTable); // Add the initial character's current ordering first. then // update its mapping to point to this contract table entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry)); mapping.setElementAt(ch, tableIndex); } // Now add (or replace) this string in the table int index = RBCollationTables.getEntry(entryTable, groupChars, fwd); if (index != RBCollationTables.UNMAPPED) { EntryPair pair = (EntryPair) entryTable.elementAt(index); pair.value = anOrder; } else { EntryPair pair = (EntryPair)entryTable.lastElement(); // NOTE: This little bit of logic is here to speed CollationElementIterator // .nextContractChar(). This code ensures that the longest sequence in // this list is always the _last_ one in the list. This keeps // nextContractChar() from having to search the entire list for the longest // sequence. 
if (groupChars.length() > pair.entryName.length()) { entryTable.addElement(new EntryPair(groupChars, anOrder, fwd)); } else { entryTable.insertElementAt(new EntryPair(groupChars, anOrder, fwd), entryTable.size() - 1); } } // If this was a forward mapping for a contracting string, also add a // reverse mapping for it, so that CollationElementIterator.previous // can work right if (fwd && groupChars.length() > 1) { addContractFlags(groupChars); addContractOrder(new StringBuffer(groupChars).reverse().toString(), anOrder, false); }
private final void addExpandOrder(java.lang.String contractChars, java.lang.String expandChars, int anOrder)
Adds the expanding string into the collation table.
// Create an expansion table entry int tableIndex = addExpansion(anOrder, expandChars); // And add its index into the main mapping table if (contractChars.length() > 1) { char ch = contractChars.charAt(0); if (Character.isHighSurrogate(ch) && contractChars.length() == 2) { char ch2 = contractChars.charAt(1); if (Character.isLowSurrogate(ch2)) { //only add into table when it is a legal surrogate addOrder(Character.toCodePoint(ch, ch2), tableIndex); } } else { addContractOrder(contractChars, tableIndex); } } else { addOrder(contractChars.charAt(0), tableIndex); }
private final void addExpandOrder(int ch, java.lang.String expandChars, int anOrder)
int tableIndex = addExpansion(anOrder, expandChars); addOrder(ch, tableIndex);
private int addExpansion(int anOrder, java.lang.String expandChars)
Create a new entry in the expansion table that contains the orderings for the given characers. If anOrder is valid, it is added to the beginning of the expanded list of orders.
if (expandTable == null) { expandTable = new Vector(INITIALTABLESIZE); } // If anOrder is valid, we want to add it at the beginning of the list int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1; int[] valueList = new int[expandChars.length() + offset]; if (offset == 1) { valueList[0] = anOrder; } int j = offset; for (int i = 0; i < expandChars.length(); i++) { char ch0 = expandChars.charAt(i); char ch1; int ch; if (Character.isHighSurrogate(ch0)) { if (++i == expandChars.length() || !Character.isLowSurrogate(ch1=expandChars.charAt(i))) { //ether we are missing the low surrogate or the next char //is not a legal low surrogate, so stop loop break; } ch = Character.toCodePoint(ch0, ch1); } else { ch = ch0; } int mapValue = getCharOrder(ch); if (mapValue != RBCollationTables.UNMAPPED) { valueList[j++] = mapValue; } else { // can't find it in the table, will be filled in by commit(). valueList[j++] = CHARINDEX + ch; } } if (j < valueList.length) { //we had at least one supplementary character, the size of valueList //is bigger than it really needs... int[] tmpBuf = new int[j]; while (--j >= 0) { tmpBuf[j] = valueList[j]; } valueList = tmpBuf; } // Add the expanding char list into the expansion table. int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size(); expandTable.addElement(valueList); return tableIndex;
private final void addOrder(int ch, int anOrder)
Adds a character and its designated order into the collation table.
// See if the char already has an order in the mapping table int order = mapping.elementAt(ch); if (order >= RBCollationTables.CONTRACTCHARINDEX) { // There's already an entry for this character that points to a contracting // character table. Instead of adding the character directly to the mapping // table, we must add it to the contract table instead. int length = 1; if (Character.isSupplementaryCodePoint(ch)) { length = Character.toChars(ch, keyBuf, 0); } else { keyBuf[0] = (char)ch; } addContractOrder(new String(keyBuf, 0, length), anOrder); } else { // add the entry to the mapping table, // the same later entry replaces the previous one mapping.setElementAt(ch, anOrder); }
public void build(java.lang.String pattern, int decmp)
Create a table-based collation object with the given rules. This is the main function that actually builds the tables and stores them back in the RBCollationTables object. It is called ONLY by the RBCollationTables constructor.
see
java.util.RuleBasedCollator#RuleBasedCollator
exception
ParseException If the rules format is incorrect.
boolean isSource = true; int i = 0; String expChars; String groupChars; if (pattern.length() == 0) throw new ParseException("Build rules empty.", 0); // This array maps Unicode characters to their collation ordering mapping = new UCompactIntArray((int)RBCollationTables.UNMAPPED); // Normalize the build rules. Find occurances of all decomposed characters // and normalize the rules before feeding into the builder. By "normalize", // we mean that all precomposed Unicode characters must be converted into // a base character and one or more combining characters (such as accents). // When there are multiple combining characters attached to a base character, // the combining characters must be in their canonical order // // sherman/Note: //(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose //hangual syllables to jamos, so we can actually just call decompose with //normalizer's IGNORE_HANGUL option turned on // //(2)just call the "special version" in NormalizerImpl directly //pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true); // //Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp); //pattern = Normalizer.normalize(pattern, mode, 0, true); pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern); // Build the merged collation entries // Since rules can be specified in any order in the string // (e.g. "c , C < d , D < e , E .... C < CH") // this splits all of the rules in the string out into separate // objects and then sorts them. In the above example, it merges the // "C < CH" rule in just before the "C < D" rule. 
// mPattern = new MergeCollation(pattern); int order = 0; // Now walk though each entry and add it to my own tables for (i = 0; i < mPattern.getCount(); ++i) { PatternEntry entry = mPattern.getItemAt(i); if (entry != null) { groupChars = entry.getChars(); if (groupChars.length() > 1) { switch(groupChars.charAt(groupChars.length()-1)) { case '@": frenchSec = true; groupChars = groupChars.substring(0, groupChars.length()-1); break; case '!": seAsianSwapping = true; groupChars = groupChars.substring(0, groupChars.length()-1); break; } } order = increment(entry.getStrength(), order); expChars = entry.getExtension(); if (expChars.length() != 0) { addExpandOrder(groupChars, expChars, order); } else if (groupChars.length() > 1) { char ch = groupChars.charAt(0); if (Character.isHighSurrogate(ch) && groupChars.length() == 2) { addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order); } else { addContractOrder(groupChars, order); } } else { char ch = groupChars.charAt(0); addOrder(ch, order); } } } addComposedChars(); commit(); mapping.compact(); /* System.out.println("mappingSize=" + mapping.getKSize()); for (int j = 0; j < 0xffff; j++) { int value = mapping.elementAt(j); if (value != RBCollationTables.UNMAPPED) System.out.println("index=" + Integer.toString(j, 16) + ", value=" + Integer.toString(value, 16)); } */ tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable, contractFlags, maxSecOrder, maxTerOrder);
private final void commit()
Look up for unmapped values in the expanded character table. When the expanding character tables are built by addExpandOrder, it doesn't know what the final ordering of each character in the expansion will be. Instead, it just puts the raw character code into the table, adding CHARINDEX as a flag. Now that we've finished building the mapping table, we can go back and look up that character to see what its real collation order is and stick that into the expansion table. That lets us avoid doing a two-stage lookup later.
if (expandTable != null) { for (int i = 0; i < expandTable.size(); i++) { int[] valueList = (int [])expandTable.elementAt(i); for (int j = 0; j < valueList.length; j++) { int order = valueList[j]; if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) { // found a expanding character that isn't filled in yet int ch = order - CHARINDEX; // Get the real values for the non-filled entry int realValue = getCharOrder(ch); if (realValue == RBCollationTables.UNMAPPED) { // The real value is still unmapped, maybe it's ignorable valueList[j] = IGNORABLEMASK & ch; } else { // just fill in the value valueList[j] = realValue; } } } } }
private final int getCharOrder(int ch)
int order = mapping.elementAt(ch); if (order >= RBCollationTables.CONTRACTCHARINDEX) { Vector groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX); EntryPair pair = (EntryPair)groupList.firstElement(); order = pair.value; } return order;
private int getContractOrder(java.lang.String groupChars)
If the given string has been specified as a contracting string in this collation table, return its ordering. Otherwise return UNMAPPED.
int result = RBCollationTables.UNMAPPED; if (contractTable != null) { int ch = groupChars.codePointAt(0); /* char ch0 = groupChars.charAt(0); int ch = Character.isHighSurrogate(ch0)? Character.toCodePoint(ch0, groupChars.charAt(1)):ch0; */ Vector entryTable = getContractValues(ch); if (entryTable != null) { int index = RBCollationTables.getEntry(entryTable, groupChars, true); if (index != RBCollationTables.UNMAPPED) { EntryPair pair = (EntryPair) entryTable.elementAt(index); result = pair.value; } } } return result;
private java.util.Vector getContractValues(int ch)
Get the entry of hash table of the contracting string in the collation table.
param
ch the starting character of the contracting string
int index = mapping.elementAt(ch); return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX);
private java.util.Vector getContractValuesImpl(int index)
if (index >= 0) { return (Vector)contractTable.elementAt(index); } else // not found { return null; }
private final int increment(int aStrength, int lastValue)
Increment of the last order based on the comparison level.
switch(aStrength) { case Collator.PRIMARY: // increment priamry order and mask off secondary and tertiary difference lastValue += PRIMARYORDERINCREMENT; lastValue &= RBCollationTables.PRIMARYORDERMASK; isOverIgnore = true; break; case Collator.SECONDARY: // increment secondary order and mask off tertiary difference lastValue += SECONDARYORDERINCREMENT; lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY; // record max # of ignorable chars with secondary difference if (!isOverIgnore) maxSecOrder++; break; case Collator.TERTIARY: // increment tertiary order lastValue += TERTIARYORDERINCREMENT; // record max # of ignorable chars with tertiary difference if (!isOverIgnore) maxTerOrder++; break; } return lastValue;