SynonymMappublic class SynonymMap extends Object Loads the WordNet prolog file wn_s.pl
into a thread-safe main-memory hash map that can be used for fast
high-frequency lookups of synonyms for any given (lowercase) word string.
There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
There does not necessarily hold: A -> B, B -> C then A -> C.
Loading typically takes some 1.5 secs, so should be done only once per
(server) program execution, using a singleton pattern. Once loaded, a
synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
A loaded default synonym map consumes about 10 MB main memory.
An instance is immutable, hence thread-safe.
This implementation borrows some ideas from the Lucene Syns2Index demo that
Dave Spencer originally contributed to Lucene. Dave's approach
involved a persistent Lucene index which is suitable for occasional
lookups or very large synonym tables, but considered unsuitable for
high-frequency lookups of medium size synonym tables.
Example Usage:
String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
for (int i = 0; i < words.length; i++) {
String[] synonyms = map.getSynonyms(words[i]);
System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
}
Example output:
hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
woods:[forest, wood]
forest:[afforest, timber, timberland, wood, woodland, woods]
wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
xxxx:[]
|
Fields Summary |
---|
private final HashMap | tablethe index data; Map | private static final String[] | EMPTY | private static final boolean | DEBUG |
Constructors Summary |
---|
public SynonymMap(InputStream input)Constructs an instance, loading WordNet synonym data from the given input
stream. Finally closes the stream. The words in the stream must be in
UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
this.table = input == null ? new HashMap(0) : read(toByteArray(input));
|
Methods Summary |
---|
protected java.lang.String | analyze(java.lang.String word)Analyzes/transforms the given word on input stream loading. This default implementation simply
lowercases the word. Override this method with a custom stemming
algorithm or similar, if desired.
return word.toLowerCase();
| private java.util.HashMap | createIndex(java.util.Map word2Groups, java.util.Map group2Words)
HashMap word2Syns = new HashMap();
Iterator iter = word2Groups.entrySet().iterator();
while (iter.hasNext()) { // for each word
Map.Entry entry = (Map.Entry) iter.next();
ArrayList group = (ArrayList) entry.getValue();
String word = (String) entry.getKey();
// HashSet synonyms = new HashSet();
TreeSet synonyms = new TreeSet();
for (int i=group.size(); --i >= 0; ) { // for each groupID of word
ArrayList words = (ArrayList) group2Words.get(group.get(i));
for (int j=words.size(); --j >= 0; ) { // add all words
Object synonym = words.get(j); // note that w and word are interned
if (synonym != word) { // a word is implicitly it's own synonym
synonyms.add(synonym);
}
}
}
int size = synonyms.size();
if (size > 0) {
String[] syns = new String[size];
if (size == 1)
syns[0] = (String) synonyms.first();
else
synonyms.toArray(syns);
// if (syns.length > 1) Arrays.sort(syns);
// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
word2Syns.put(word, syns);
}
}
return word2Syns;
| public java.lang.String[] | getSynonyms(java.lang.String word)Returns the synonym set for the given word, sorted ascending.
Object syns = table.get(word);
if (syns == null) return EMPTY;
if (syns instanceof String) return new String[] {(String) syns};
String[] synonyms = (String[]) syns;
String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
return copy;
| private static boolean | isValid(java.lang.String str)
for (int i=str.length(); --i >= 0; ) {
if (!Character.isLetter(str.charAt(i))) return false;
}
return true;
| private java.util.HashMap | optimize(java.util.HashMap word2Syns, java.util.HashMap internedWords)
if (DEBUG) {
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
// collect entries
int len = 0;
int size = word2Syns.size();
String[][] allSynonyms = new String[size][];
String[] words = new String[size];
Iterator iter = word2Syns.entrySet().iterator();
for (int j=0; j < size; j++) {
Map.Entry entry = (Map.Entry) iter.next();
allSynonyms[j] = (String[]) entry.getValue();
words[j] = (String) entry.getKey();
len += words[j].length();
}
// assemble large string containing all words
StringBuffer buf = new StringBuffer(len);
for (int j=0; j < size; j++) buf.append(words[j]);
String allWords = new String(buf.toString()); // ensure compact string across JDK versions
buf = null;
// intern words at app level via memory-overlaid substrings
for (int p=0, j=0; j < size; j++) {
String word = words[j];
internedWords.put(word, allWords.substring(p, p + word.length()));
p += word.length();
}
// replace words with interned words
for (int j=0; j < size; j++) {
String[] syns = allSynonyms[j];
for (int k=syns.length; --k >= 0; ) {
syns[k] = (String) internedWords.get(syns[k]);
}
Object replacement = syns;
if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
word2Syns.remove(words[j]);
word2Syns.put(internedWords.get(words[j]), replacement);
}
if (DEBUG) {
words = null;
allSynonyms = null;
internedWords = null;
allWords = null;
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
return word2Syns;
| private java.util.HashMap | read(byte[] data)
int WORDS = (int) (76401 / 0.7); // presizing
int GROUPS = (int) (88022 / 0.7); // presizing
HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
Charset charset = Charset.forName("UTF-8");
int lastNum = -1;
Integer lastGroup = null;
int len = data.length;
int i=0;
while (i < len) { // until EOF
/* Part A: Parse a line */
// scan to beginning of group
while (i < len && data[i] != '(") i++;
if (i >= len) break; // EOF
i++;
// parse group
int num = 0;
while (i < len && data[i] != ',") {
num = 10*num + (data[i] - 48);
i++;
}
i++;
// if (DEBUG) System.err.println("num="+ num);
// scan to beginning of word
while (i < len && data[i] != '\'") i++;
i++;
// scan to end of word
int start = i;
do {
while (i < len && data[i] != '\'") i++;
i++;
} while (i < len && data[i] != ',"); // word must end with "',"
if (i >= len) break; // EOF
String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
// String word = new String(data, 0, start, i-start-1); // ASCII
/*
* Part B: ignore phrases (with spaces and hyphens) and
* non-alphabetic words, and let user customize word (e.g. do some
* stemming)
*/
if (!isValid(word)) continue; // ignore
word = analyze(word);
if (word == null || word.length() == 0) continue; // ignore
/* Part C: Add (group,word) to tables */
// ensure compact string representation, minimizing memory overhead
String w = (String) internedWords.get(word);
if (w == null) {
word = new String(word); // ensure compact string
internedWords.put(word, word);
} else {
word = w;
}
Integer group = lastGroup;
if (num != lastNum) {
group = new Integer(num);
lastGroup = group;
lastNum = num;
}
// add word --> group
ArrayList groups = (ArrayList) word2Groups.get(word);
if (groups == null) {
groups = new ArrayList(1);
word2Groups.put(word, groups);
}
groups.add(group);
// add group --> word
ArrayList words = (ArrayList) group2Words.get(group);
if (words == null) {
words = new ArrayList(1);
group2Words.put(group, words);
}
words.add(word);
}
/* Part D: compute index data structure */
HashMap word2Syns = createIndex(word2Groups, group2Words);
/* Part E: minimize memory consumption by a factor 3 (or so) */
// if (true) return word2Syns;
word2Groups = null; // help gc
group2Words = null; // help gc
return optimize(word2Syns, internedWords);
| private static byte[] | toByteArray(java.io.InputStream input)
try {
// safe and fast even if input.available() behaves weird or buggy
int len = Math.max(256, input.available());
byte[] buffer = new byte[len];
byte[] output = new byte[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
if (len == output.length) return output;
buffer = null; // help gc
buffer = new byte[len];
System.arraycopy(output, 0, buffer, 0, len);
return buffer;
} finally {
if (input != null) input.close();
}
| public java.lang.String | toString()Returns a String representation of the index data for debugging purposes.
StringBuffer buf = new StringBuffer();
Iterator iter = new TreeMap(table).keySet().iterator();
int count = 0;
int f0 = 0;
int f1 = 0;
int f2 = 0;
int f3 = 0;
while (iter.hasNext()) {
String word = (String) iter.next();
buf.append(word + ":");
String[] synonyms = getSynonyms(word);
buf.append(Arrays.asList(synonyms));
buf.append("\n");
count += synonyms.length;
if (synonyms.length == 0) f0++;
if (synonyms.length == 1) f1++;
if (synonyms.length == 2) f2++;
if (synonyms.length == 3) f3++;
}
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
return buf.toString();
|
|