File Doc Category Size Date Package
SynonymMap.java API Doc Apache Lucene 1.9 13032 Mon Feb 20 09:18:36 GMT 2006 org.apache.lucene.index.memory

SynonymMap

java.lang.Object

public class SynonymMap extends Object

Loads the WordNet prolog file wn_s.pl into a thread-safe main-memory hash map that can be used for fast high-frequncy lookups of synonyms for any given (lowercase) word string.

There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A). There does not necessary hold: A -> B, B -> C then A -> C.

Loading typically takes some 1.5 secs, so should be done only once per (server) program execution, using a singleton pattern. Once loaded, a synonym lookup via {@link #getSynonyms(String)}takes constant time O(1). A loaded default synonym map consumes about 10 MB main memory. An instance is immutable, hence thread-safe.

This implementation borrows some ideas from the Lucene Syns2Index demo that Dave Spencer originally contributed to Lucene. Dave's approach involved a persistent Lucene index which is suitable for occasional lookups or very large synonym tables, but considered unsuitable for high-frequency lookups of medium size synonym tables.

Example Usage:

String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
for (int i = 0; i < words.length; i++) {
String[] synonyms = map.getSynonyms(words[i]);
System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
}

Example output:
hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
woods:[forest, wood]
forest:[afforest, timber, timberland, wood, woodland, woods]
wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
xxxx:[]

author: whoschek.AT.lbl.DOT.gov
see: prologdb man page
see: Dave's synonym demo site

Fields Summary
private final HashMap
table
the index data; Map
private static final String[]
EMPTY
private static final boolean
DEBUG
Constructors Summary
public SynonymMap(InputStream input)
Constructs an instance, loading WordNet synonym data from the given input stream. Finally closes the stream. The words in the stream must be in UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
param
input the stream to read from (null indicates an empty synonym map)
throws
IOException if an error occured while reading the stream.
this.table = input == null ? new HashMap(0) : read(toByteArray(input));
Methods Summary
protected java.lang.String analyze(java.lang.String word)
Analyzes/transforms the given word on input stream loading. This default implementation simply lowercases the word. Override this method with a custom stemming algorithm or similar, if desired.
param
word the word to analyze
return
the same word, or a different word (or null to indicate that the word should be ignored)
return word.toLowerCase();
private java.util.HashMap createIndex(java.util.Map word2Groups, java.util.Map group2Words)
HashMap word2Syns = new HashMap(); Iterator iter = word2Groups.entrySet().iterator(); while (iter.hasNext()) { // for each word Map.Entry entry = (Map.Entry) iter.next(); ArrayList group = (ArrayList) entry.getValue(); String word = (String) entry.getKey(); // HashSet synonyms = new HashSet(); TreeSet synonyms = new TreeSet(); for (int i=group.size(); --i >= 0; ) { // for each groupID of word ArrayList words = (ArrayList) group2Words.get(group.get(i)); for (int j=words.size(); --j >= 0; ) { // add all words Object synonym = words.get(j); // note that w and word are interned if (synonym != word) { // a word is implicitly it's own synonym synonyms.add(synonym); } } } int size = synonyms.size(); if (size > 0) { String[] syns = new String[size]; if (size == 1) syns[0] = (String) synonyms.first(); else synonyms.toArray(syns); // if (syns.length > 1) Arrays.sort(syns); // if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns)); word2Syns.put(word, syns); } } return word2Syns;
public java.lang.String[] getSynonyms(java.lang.String word)
Returns the synonym set for the given word, sorted ascending.
param
word the word to lookup (must be in lowercase).
return
the synonyms; a set of zero or more words, sorted ascending, each word containing lowercase characters that satisfy Character.isLetter().
Object syns = table.get(word); if (syns == null) return EMPTY; if (syns instanceof String) return new String[] {(String) syns}; String[] synonyms = (String[]) syns; String[] copy = new String[synonyms.length]; // copy for guaranteed immutability System.arraycopy(synonyms, 0, copy, 0, synonyms.length); return copy;
private static boolean isValid(java.lang.String str)
for (int i=str.length(); --i >= 0; ) { if (!Character.isLetter(str.charAt(i))) return false; } return true;
private java.util.HashMap optimize(java.util.HashMap word2Syns, java.util.HashMap internedWords)
if (DEBUG) { System.err.println("before gc"); for (int i=0; i < 10; i++) System.gc(); System.err.println("after gc"); } // collect entries int len = 0; int size = word2Syns.size(); String[][] allSynonyms = new String[size][]; String[] words = new String[size]; Iterator iter = word2Syns.entrySet().iterator(); for (int j=0; j < size; j++) { Map.Entry entry = (Map.Entry) iter.next(); allSynonyms[j] = (String[]) entry.getValue(); words[j] = (String) entry.getKey(); len += words[j].length(); } // assemble large string containing all words StringBuffer buf = new StringBuffer(len); for (int j=0; j < size; j++) buf.append(words[j]); String allWords = new String(buf.toString()); // ensure compact string across JDK versions buf = null; // intern words at app level via memory-overlaid substrings for (int p=0, j=0; j < size; j++) { String word = words[j]; internedWords.put(word, allWords.substring(p, p + word.length())); p += word.length(); } // replace words with interned words for (int j=0; j < size; j++) { String[] syns = allSynonyms[j]; for (int k=syns.length; --k >= 0; ) { syns[k] = (String) internedWords.get(syns[k]); } Object replacement = syns; if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more word2Syns.remove(words[j]); word2Syns.put(internedWords.get(words[j]), replacement); } if (DEBUG) { words = null; allSynonyms = null; internedWords = null; allWords = null; System.err.println("before gc"); for (int i=0; i < 10; i++) System.gc(); System.err.println("after gc"); } return word2Syns;
private java.util.HashMap read(byte[] data)
int WORDS = (int) (76401 / 0.7); // presizing int GROUPS = (int) (88022 / 0.7); // presizing HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups> HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words> HashMap internedWords = new HashMap(WORDS);// Map<String word, String word> Charset charset = Charset.forName("UTF-8"); int lastNum = -1; Integer lastGroup = null; int len = data.length; int i=0; while (i < len) { // until EOF /* Part A: Parse a line */ // scan to beginning of group while (i < len && data[i] != '(") i++; if (i >= len) break; // EOF i++; // parse group int num = 0; while (i < len && data[i] != ',") { num = 10*num + (data[i] - 48); i++; } i++; // if (DEBUG) System.err.println("num="+ num); // scan to beginning of word while (i < len && data[i] != '\'") i++; i++; // scan to end of word int start = i; do { while (i < len && data[i] != '\'") i++; i++; } while (i < len && data[i] != ',"); // word must end with "'," if (i >= len) break; // EOF String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString(); // String word = new String(data, 0, start, i-start-1); // ASCII /* * Part B: ignore phrases (with spaces and hyphens) and * non-alphabetic words, and let user customize word (e.g. do some * stemming) */ if (!isValid(word)) continue; // ignore word = analyze(word); if (word == null || word.length() == 0) continue; // ignore /* Part C: Add (group,word) to tables */ // ensure compact string representation, minimizing memory overhead String w = (String) internedWords.get(word); if (w == null) { word = new String(word); // ensure compact string internedWords.put(word, word); } else { word = w; } Integer group = lastGroup; if (num != lastNum) { group = new Integer(num); lastGroup = group; lastNum = num; } // add word --> group ArrayList groups = (ArrayList) word2Groups.get(word); if (groups == null) { groups = new ArrayList(1); word2Groups.put(word, groups); } groups.add(group); // add group --> word ArrayList words = (ArrayList) group2Words.get(group); if (words == null) { words = new ArrayList(1); group2Words.put(group, words); } words.add(word); } /* Part D: compute index data structure */ HashMap word2Syns = createIndex(word2Groups, group2Words); /* Part E: minimize memory consumption by a factor 3 (or so) */ // if (true) return word2Syns; word2Groups = null; // help gc group2Words = null; // help gc return optimize(word2Syns, internedWords);
private static byte[] toByteArray(java.io.InputStream input)
try { // safe and fast even if input.available() behaves weird or buggy int len = Math.max(256, input.available()); byte[] buffer = new byte[len]; byte[] output = new byte[len]; len = 0; int n; while ((n = input.read(buffer)) >= 0) { if (len + n > output.length) { // grow capacity byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; System.arraycopy(output, 0, tmp, 0, len); System.arraycopy(buffer, 0, tmp, len, n); buffer = output; // use larger buffer for future larger bulk reads output = tmp; } else { System.arraycopy(buffer, 0, output, len, n); } len += n; } if (len == output.length) return output; buffer = null; // help gc buffer = new byte[len]; System.arraycopy(output, 0, buffer, 0, len); return buffer; } finally { if (input != null) input.close(); }
public java.lang.String toString()
Returns a String representation of the index data for debugging purposes.
return
a String representation
StringBuffer buf = new StringBuffer(); Iterator iter = new TreeMap(table).keySet().iterator(); int count = 0; int f0 = 0; int f1 = 0; int f2 = 0; int f3 = 0; while (iter.hasNext()) { String word = (String) iter.next(); buf.append(word + ":"); String[] synonyms = getSynonyms(word); buf.append(Arrays.asList(synonyms)); buf.append("\n"); count += synonyms.length; if (synonyms.length == 0) f0++; if (synonyms.length == 1) f1++; if (synonyms.length == 2) f2++; if (synonyms.length == 3) f3++; } buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3); return buf.toString();