FileDocCategorySizeDatePackage
SynonymMap.javaAPI DocApache Lucene 1.913032Mon Feb 20 09:18:36 GMT 2006org.apache.lucene.index.memory

SynonymMap

public class SynonymMap extends Object
Loads the WordNet prolog file wn_s.pl into a thread-safe main-memory hash map that can be used for fast high-frequncy lookups of synonyms for any given (lowercase) word string.

There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A). There does not necessary hold: A -> B, B -> C then A -> C.

Loading typically takes some 1.5 secs, so should be done only once per (server) program execution, using a singleton pattern. Once loaded, a synonym lookup via {@link #getSynonyms(String)}takes constant time O(1). A loaded default synonym map consumes about 10 MB main memory. An instance is immutable, hence thread-safe.

This implementation borrows some ideas from the Lucene Syns2Index demo that Dave Spencer originally contributed to Lucene. Dave's approach involved a persistent Lucene index which is suitable for occasional lookups or very large synonym tables, but considered unsuitable for high-frequency lookups of medium size synonym tables.

Example Usage:

String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
for (int i = 0; i < words.length; i++) {
String[] synonyms = map.getSynonyms(words[i]);
System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
}

Example output:
hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
woods:[forest, wood]
forest:[afforest, timber, timberland, wood, woodland, woods]
wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
xxxx:[]
author
whoschek.AT.lbl.DOT.gov
see
prologdb man page
see
Dave's synonym demo site

Fields Summary
private final HashMap
table
the index data; Map
private static final String[]
EMPTY
private static final boolean
DEBUG
Constructors Summary
public SynonymMap(InputStream input)
Constructs an instance, loading WordNet synonym data from the given input stream. Finally closes the stream. The words in the stream must be in UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).

param
input the stream to read from (null indicates an empty synonym map)
throws
IOException if an error occured while reading the stream.


	                                                                                 	 
	     
		this.table = input == null ? new HashMap(0) : read(toByteArray(input));
	
Methods Summary
protected java.lang.Stringanalyze(java.lang.String word)
Analyzes/transforms the given word on input stream loading. This default implementation simply lowercases the word. Override this method with a custom stemming algorithm or similar, if desired.

param
word the word to analyze
return
the same word, or a different word (or null to indicate that the word should be ignored)

		return word.toLowerCase();
	
private java.util.HashMapcreateIndex(java.util.Map word2Groups, java.util.Map group2Words)

		HashMap word2Syns = new HashMap();
		Iterator iter = word2Groups.entrySet().iterator();
		
		while (iter.hasNext()) { // for each word
			Map.Entry entry = (Map.Entry) iter.next();
			ArrayList group = (ArrayList) entry.getValue();			
			String word = (String) entry.getKey();
			
//			HashSet synonyms = new HashSet();
			TreeSet synonyms = new TreeSet();
			for (int i=group.size(); --i >= 0; ) { // for each groupID of word
				ArrayList words = (ArrayList) group2Words.get(group.get(i));
				for (int j=words.size(); --j >= 0; ) { // add all words				
					Object synonym = words.get(j); // note that w and word are interned
					if (synonym != word) { // a word is implicitly it's own synonym
						synonyms.add(synonym);
					}
				}
			}

			int size = synonyms.size();
			if (size > 0) {
				String[] syns = new String[size];
				if (size == 1)  
					syns[0] = (String) synonyms.first();
				else
					synonyms.toArray(syns);
//				if (syns.length > 1) Arrays.sort(syns);
//				if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
				word2Syns.put(word, syns);
			}
		}
	
		return word2Syns;
	
public java.lang.String[]getSynonyms(java.lang.String word)
Returns the synonym set for the given word, sorted ascending.

param
word the word to lookup (must be in lowercase).
return
the synonyms; a set of zero or more words, sorted ascending, each word containing lowercase characters that satisfy Character.isLetter().

		Object syns = table.get(word);
		if (syns == null) return EMPTY;
		if (syns instanceof String) return new String[] {(String) syns};
		
		String[] synonyms = (String[]) syns;
		String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
		System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
		return copy;
	
private static booleanisValid(java.lang.String str)

		for (int i=str.length(); --i >= 0; ) {
			if (!Character.isLetter(str.charAt(i))) return false;
		}
		return true;
	
private java.util.HashMapoptimize(java.util.HashMap word2Syns, java.util.HashMap internedWords)

		if (DEBUG) {
			System.err.println("before gc");
			for (int i=0; i < 10; i++) System.gc();
			System.err.println("after gc");
		}
		
		// collect entries
		int len = 0;
		int size = word2Syns.size();
		String[][] allSynonyms = new String[size][];
		String[] words = new String[size];
		Iterator iter = word2Syns.entrySet().iterator();
		for (int j=0; j < size; j++) {
			Map.Entry entry = (Map.Entry) iter.next();
			allSynonyms[j] = (String[]) entry.getValue(); 
			words[j] = (String) entry.getKey();
			len += words[j].length();
		}
		
		// assemble large string containing all words
		StringBuffer buf = new StringBuffer(len);
		for (int j=0; j < size; j++) buf.append(words[j]);
		String allWords = new String(buf.toString()); // ensure compact string across JDK versions
		buf = null;
		
		// intern words at app level via memory-overlaid substrings
		for (int p=0, j=0; j < size; j++) {
			String word = words[j];
			internedWords.put(word, allWords.substring(p, p + word.length()));
			p += word.length();
		}
		
		// replace words with interned words
		for (int j=0; j < size; j++) {
			String[] syns = allSynonyms[j];
			for (int k=syns.length; --k >= 0; ) {
				syns[k] = (String) internedWords.get(syns[k]);
			}
			Object replacement = syns;
			if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
			word2Syns.remove(words[j]);
			word2Syns.put(internedWords.get(words[j]), replacement);
		}
		
		if (DEBUG) {
			words = null;
			allSynonyms = null;
			internedWords = null;
			allWords = null;
			System.err.println("before gc");
			for (int i=0; i < 10; i++) System.gc();
			System.err.println("after gc");
		}
		return word2Syns;
	
private java.util.HashMapread(byte[] data)

		int WORDS  = (int) (76401 / 0.7); // presizing
		int GROUPS = (int) (88022 / 0.7); // presizing
		HashMap word2Groups = new HashMap(WORDS);  // Map<String word, int[] groups>
		HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
		HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>

		Charset charset = Charset.forName("UTF-8");
		int lastNum = -1;
		Integer lastGroup = null;
		int len = data.length;
		int i=0;
		
		while (i < len) { // until EOF
			/* Part A: Parse a line */
			
			// scan to beginning of group
			while (i < len && data[i] != '(") i++;
			if (i >= len) break; // EOF
			i++;
			
			// parse group
			int num = 0;
			while (i < len && data[i] != ',") {
				num = 10*num + (data[i] - 48);
				i++;
			}
			i++;
//			if (DEBUG) System.err.println("num="+ num);
			
			// scan to beginning of word
			while (i < len && data[i] != '\'") i++;
			i++;
	
			// scan to end of word
			int start = i;
			do {
				while (i < len && data[i] != '\'") i++;
				i++;
			} while (i < len && data[i] != ',"); // word must end with "',"
			
			if (i >= len) break; // EOF
			String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
//			String word = new String(data, 0, start, i-start-1); // ASCII
			
			/*
			 * Part B: ignore phrases (with spaces and hyphens) and
			 * non-alphabetic words, and let user customize word (e.g. do some
			 * stemming)
			 */
			if (!isValid(word)) continue; // ignore
			word = analyze(word);
			if (word == null || word.length() == 0) continue; // ignore
			
			
			/* Part C: Add (group,word) to tables */
			
			// ensure compact string representation, minimizing memory overhead
			String w = (String) internedWords.get(word);
			if (w == null) {
				word = new String(word); // ensure compact string
				internedWords.put(word, word);
			} else {
				word = w;
			}
			
			Integer group = lastGroup;
			if (num != lastNum) {
				group = new Integer(num);
				lastGroup = group;
				lastNum = num;
			}
			
			// add word --> group
			ArrayList groups = (ArrayList) word2Groups.get(word);
			if (groups == null) {
				groups = new ArrayList(1);
				word2Groups.put(word, groups);
			}
			groups.add(group);

			// add group --> word
			ArrayList words = (ArrayList) group2Words.get(group);
			if (words == null) {
				words = new ArrayList(1);
				group2Words.put(group, words);
			} 
			words.add(word);
		}
		
		
		/* Part D: compute index data structure */
		HashMap word2Syns = createIndex(word2Groups, group2Words);		
				
		/* Part E: minimize memory consumption by a factor 3 (or so) */
//		if (true) return word2Syns;
		word2Groups = null; // help gc
		group2Words = null; // help gc		
		return optimize(word2Syns, internedWords);
	
private static byte[]toByteArray(java.io.InputStream input)

		try {
			// safe and fast even if input.available() behaves weird or buggy
			int len = Math.max(256, input.available());
			byte[] buffer = new byte[len];
			byte[] output = new byte[len];
			
			len = 0;
			int n;
			while ((n = input.read(buffer)) >= 0) {
				if (len + n > output.length) { // grow capacity
					byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
					System.arraycopy(output, 0, tmp, 0, len);
					System.arraycopy(buffer, 0, tmp, len, n);
					buffer = output; // use larger buffer for future larger bulk reads
					output = tmp;
				} else {
					System.arraycopy(buffer, 0, output, len, n);
				}
				len += n;
			}

			if (len == output.length) return output;
			buffer = null; // help gc
			buffer = new byte[len];
			System.arraycopy(output, 0, buffer, 0, len);
			return buffer;
		} finally {
			if (input != null) input.close();
		}
	
public java.lang.StringtoString()
Returns a String representation of the index data for debugging purposes.

return
a String representation

		StringBuffer buf = new StringBuffer();
		Iterator iter = new TreeMap(table).keySet().iterator();
		int count = 0;
		int f0 = 0;
		int f1 = 0;
		int f2 = 0;
		int f3 = 0;
		
		while (iter.hasNext()) {
			String word = (String) iter.next();
			buf.append(word + ":");
			String[] synonyms = getSynonyms(word);
			buf.append(Arrays.asList(synonyms));
			buf.append("\n");
			count += synonyms.length;
			if (synonyms.length == 0) f0++;
			if (synonyms.length == 1) f1++;
			if (synonyms.length == 2) f2++;
			if (synonyms.length == 3) f3++;
		}
		
		buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
		return buf.toString();