SynonymMap.java (Apache Lucene 2.1.0)

File	Doc	Category	Size	Date	Package
SynonymMap.java	API Doc	Apache Lucene 2.1.0	14005	Wed Feb 14 10:46:24 GMT 2007	org.apache.lucene.index.memory
SynonymMap.java

package org.apache.lucene.index.memory;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Loads the <a target="_blank" 
 * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
 * href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
 * into a thread-safe main-memory hash map that can be used for fast
 * high-frequency lookups of synonyms for any given (lowercase) word string.
 * <p>
 * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
 * There does not necessarily hold: A -> B, B -> C then A -> C.
 * <p>
 * Loading typically takes some 1.5 secs, so should be done only once per
 * (server) program execution, using a singleton pattern. Once loaded, a
 * synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
 * A loaded default synonym map consumes about 10 MB main memory.
 * An instance is immutable, hence thread-safe.
 * <p>
 * This implementation borrows some ideas from the Lucene Syns2Index demo that 
 * Dave Spencer originally contributed to Lucene. Dave's approach
 * involved a persistent Lucene index which is suitable for occasional
 * lookups or very large synonym tables, but considered unsuitable for 
 * high-frequency lookups of medium size synonym tables.
 * <p>
 * Example Usage:
 * <pre>
 * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
 * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
 * for (int i = 0; i < words.length; i++) {
 *     String[] synonyms = map.getSynonyms(words[i]);
 *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
 * }
 * 
 * Example output:
 * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
 * woods:[forest, wood]
 * forest:[afforest, timber, timberland, wood, woodland, woods]
 * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
 * xxxx:[]
 * </pre>
 * 
 * @author whoschek.AT.lbl.DOT.gov
 * @see <a target="_blank"
 *      href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
 *      man page </a>
 * @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
 */
public class SynonymMap {

  /** the index data; Map<String word, String[] synonyms> */
  private final HashMap table;
  
  private static final String[] EMPTY = new String[0];
  
  private static final boolean DEBUG = false;

  /**
   * Constructs an instance, loading WordNet synonym data from the given input
   * stream. Finally closes the stream. The words in the stream must be in
   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
   * 
   * @param input
   *            the stream to read from (null indicates an empty synonym map)
   * @throws IOException
   *             if an error occured while reading the stream.
   */
  public SynonymMap(InputStream input) throws IOException {
    this.table = input == null ? new HashMap(0) : read(toByteArray(input));
  }
  
  /**
   * Returns the synonym set for the given word, sorted ascending.
   * 
   * @param word
   *            the word to lookup (must be in lowercase).
   * @return the synonyms; a set of zero or more words, sorted ascending, each
   *         word containing lowercase characters that satisfy
   *         <code>Character.isLetter()</code>.
   */
  public String[] getSynonyms(String word) {
    Object syns = table.get(word);
    if (syns == null) return EMPTY;
    if (syns instanceof String) return new String[] {(String) syns};
    
    String[] synonyms = (String[]) syns;
    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
    return copy;
  }
  
  /**
   * Returns a String representation of the index data for debugging purposes.
   * 
   * @return a String representation
   */
  public String toString() {
    StringBuffer buf = new StringBuffer();
    Iterator iter = new TreeMap(table).keySet().iterator();
    int count = 0;
    int f0 = 0;
    int f1 = 0;
    int f2 = 0;
    int f3 = 0;
    
    while (iter.hasNext()) {
      String word = (String) iter.next();
      buf.append(word + ":");
      String[] synonyms = getSynonyms(word);
      buf.append(Arrays.asList(synonyms));
      buf.append("\n");
      count += synonyms.length;
      if (synonyms.length == 0) f0++;
      if (synonyms.length == 1) f1++;
      if (synonyms.length == 2) f2++;
      if (synonyms.length == 3) f3++;
    }
    
    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
    return buf.toString();
  }
  
  /**
   * Analyzes/transforms the given word on input stream loading. This default implementation simply
   * lowercases the word. Override this method with a custom stemming
   * algorithm or similar, if desired.
   * 
   * @param word
   *            the word to analyze
   * @return the same word, or a different word (or null to indicate that the
   *         word should be ignored)
   */
  protected String analyze(String word) {
    return word.toLowerCase();
  }

  private static boolean isValid(String str) {
    for (int i=str.length(); --i >= 0; ) {
      if (!Character.isLetter(str.charAt(i))) return false;
    }
    return true;
  }

  private HashMap read(byte[] data) {
    int WORDS  = (int) (76401 / 0.7); // presizing
    int GROUPS = (int) (88022 / 0.7); // presizing
    HashMap word2Groups = new HashMap(WORDS);  // Map<String word, int[] groups>
    HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
    HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>

    Charset charset = Charset.forName("UTF-8");
    int lastNum = -1;
    Integer lastGroup = null;
    int len = data.length;
    int i=0;
    
    while (i < len) { // until EOF
      /* Part A: Parse a line */
      
      // scan to beginning of group
      while (i < len && data[i] != '(') i++;
      if (i >= len) break; // EOF
      i++;
      
      // parse group
      int num = 0;
      while (i < len && data[i] != ',') {
        num = 10*num + (data[i] - 48);
        i++;
      }
      i++;
//      if (DEBUG) System.err.println("num="+ num);
      
      // scan to beginning of word
      while (i < len && data[i] != '\'') i++;
      i++;
  
      // scan to end of word
      int start = i;
      do {
        while (i < len && data[i] != '\'') i++;
        i++;
      } while (i < len && data[i] != ','); // word must end with "',"
      
      if (i >= len) break; // EOF
      String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
//      String word = new String(data, 0, start, i-start-1); // ASCII
      
      /*
       * Part B: ignore phrases (with spaces and hyphens) and
       * non-alphabetic words, and let user customize word (e.g. do some
       * stemming)
       */
      if (!isValid(word)) continue; // ignore
      word = analyze(word);
      if (word == null || word.length() == 0) continue; // ignore
      
      
      /* Part C: Add (group,word) to tables */
      
      // ensure compact string representation, minimizing memory overhead
      String w = (String) internedWords.get(word);
      if (w == null) {
        word = new String(word); // ensure compact string
        internedWords.put(word, word);
      } else {
        word = w;
      }
      
      Integer group = lastGroup;
      if (num != lastNum) {
        group = new Integer(num);
        lastGroup = group;
        lastNum = num;
      }
      
      // add word --> group
      ArrayList groups = (ArrayList) word2Groups.get(word);
      if (groups == null) {
        groups = new ArrayList(1);
        word2Groups.put(word, groups);
      }
      groups.add(group);

      // add group --> word
      ArrayList words = (ArrayList) group2Words.get(group);
      if (words == null) {
        words = new ArrayList(1);
        group2Words.put(group, words);
      } 
      words.add(word);
    }
    
    
    /* Part D: compute index data structure */
    HashMap word2Syns = createIndex(word2Groups, group2Words);    
        
    /* Part E: minimize memory consumption by a factor 3 (or so) */
//    if (true) return word2Syns;
    word2Groups = null; // help gc
    group2Words = null; // help gc    
    return optimize(word2Syns, internedWords);
  }
  
  private HashMap createIndex(Map word2Groups, Map group2Words) {
    HashMap word2Syns = new HashMap();
    Iterator iter = word2Groups.entrySet().iterator();
    
    while (iter.hasNext()) { // for each word
      Map.Entry entry = (Map.Entry) iter.next();
      ArrayList group = (ArrayList) entry.getValue();     
      String word = (String) entry.getKey();
      
//      HashSet synonyms = new HashSet();
      TreeSet synonyms = new TreeSet();
      for (int i=group.size(); --i >= 0; ) { // for each groupID of word
        ArrayList words = (ArrayList) group2Words.get(group.get(i));
        for (int j=words.size(); --j >= 0; ) { // add all words       
          Object synonym = words.get(j); // note that w and word are interned
          if (synonym != word) { // a word is implicitly it's own synonym
            synonyms.add(synonym);
          }
        }
      }

      int size = synonyms.size();
      if (size > 0) {
        String[] syns = new String[size];
        if (size == 1)  
          syns[0] = (String) synonyms.first();
        else
          synonyms.toArray(syns);
//        if (syns.length > 1) Arrays.sort(syns);
//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
        word2Syns.put(word, syns);
      }
    }
  
    return word2Syns;
  }

  private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
    if (DEBUG) {
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    
    // collect entries
    int len = 0;
    int size = word2Syns.size();
    String[][] allSynonyms = new String[size][];
    String[] words = new String[size];
    Iterator iter = word2Syns.entrySet().iterator();
    for (int j=0; j < size; j++) {
      Map.Entry entry = (Map.Entry) iter.next();
      allSynonyms[j] = (String[]) entry.getValue(); 
      words[j] = (String) entry.getKey();
      len += words[j].length();
    }
    
    // assemble large string containing all words
    StringBuffer buf = new StringBuffer(len);
    for (int j=0; j < size; j++) buf.append(words[j]);
    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
    buf = null;
    
    // intern words at app level via memory-overlaid substrings
    for (int p=0, j=0; j < size; j++) {
      String word = words[j];
      internedWords.put(word, allWords.substring(p, p + word.length()));
      p += word.length();
    }
    
    // replace words with interned words
    for (int j=0; j < size; j++) {
      String[] syns = allSynonyms[j];
      for (int k=syns.length; --k >= 0; ) {
        syns[k] = (String) internedWords.get(syns[k]);
      }
      Object replacement = syns;
      if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
      word2Syns.remove(words[j]);
      word2Syns.put(internedWords.get(words[j]), replacement);
    }
    
    if (DEBUG) {
      words = null;
      allSynonyms = null;
      internedWords = null;
      allWords = null;
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    return word2Syns;
  }
  
  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }
  
}