// File | Doc | Category | Size | Date | Package
// SweetSpotSimilarity.java | API Doc | Apache Lucene 2.0.0 | 6212 | Fri May 26 09:53:52 BST 2006 | org.apache.lucene.misc

// SweetSpotSimilarity.java

/**
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.misc;

import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.DefaultSimilarity;

import java.util.Map;
import java.util.HashMap;

/**
 * A similarity with a lengthNorm that provides for a "plateau" of
 * equally good lengths, and tf helper functions.
 *
 * <p>
 * For lengthNorm, a global min/max can be specified to define the
 * plateau of lengths that should all have a norm of 1.0.
 * Below the min, and above the max the lengthNorm drops off in a
 * sqrt function.
 * </p>
 * <p>
 * A per field min/max can be specified if different fields have
 * different sweet spots.
 * </p>
 *
 * <p>
 * For tf, baselineTf and hyperbolicTf functions are provided, which
 * subclasses can choose between.
 * </p>
 *
 */
public class SweetSpotSimilarity extends DefaultSimilarity {

  // Global lengthNorm factors, used when a field has no specific setting.
  private int ln_min = 1;
  private int ln_max = 1;
  private float ln_steep = 0.5f;

  // Per-field lengthNorm factors, keyed by field name
  // (values are Integer, Integer and Float respectively).
  private Map ln_mins = new HashMap(7);
  private Map ln_maxs = new HashMap(7);
  private Map ln_steeps = new HashMap(7);

  // Factors for baselineTf.
  private float tf_base = 0.0f;
  private float tf_min = 0.0f;

  // Factors for hyperbolicTf.
  private float tf_hyper_min = 0.0f;
  private float tf_hyper_max = 2.0f;
  private double tf_hyper_base = 1.3d;
  private float tf_hyper_xoffset = 10.0f;
    
  public SweetSpotSimilarity() {
    super();
  }

  /**
   * Sets the baseline and minimum function variables for baselineTf
   *
   * @param base the value returned for any non-zero freq that is
   *             less than or equal to min (default: 0.0)
   * @param min the freq up to which the baseline is returned (default: 0.0)
   * @see #baselineTf
   */
  public void setBaselineTfFactors(float base, float min) {
    tf_min = min;
    tf_base = base;
  }
  
  /**
   * Sets the function variables for the hyperbolicTf functions
   *
   * @param min the minimum tf value to ever be returned (default: 0.0)
   * @param max the maximum tf value to ever be returned (default: 2.0)
   * @param base the base value to be used in the exponential for the hyperbolic function (default: 1.3)
   * @param xoffset the midpoint of the hyperbolic function (default: 10.0)
   * @see #hyperbolicTf
   */
  public void setHyperbolicTfFactors(float min, float max,
                                     double base, float xoffset) {
    tf_hyper_min = min;
    tf_hyper_max = max;
    tf_hyper_base = base;
    tf_hyper_xoffset = xoffset;
  }
    
  /**
   * Sets the default function variables used by lengthNorm when no field
   * specific variables have been set.
   *
   * @param min the shortest length that gets a norm of 1.0 (default: 1)
   * @param max the longest length that gets a norm of 1.0 (default: 1)
   * @param steepness how quickly the norm drops off outside of
   *                  min/max (default: 0.5)
   * @see #lengthNorm
   */
  public void setLengthNormFactors(int min, int max, float steepness) {
    this.ln_min = min;
    this.ln_max = max;
    this.ln_steep = steepness;
  }

  /**
   * Sets the function variables used by lengthNorm for a specific named field
   *
   * @param field the name of the field these factors apply to
   * @param min the shortest length that gets a norm of 1.0
   * @param max the longest length that gets a norm of 1.0
   * @param steepness how quickly the norm drops off outside of min/max
   * @see #lengthNorm
   */
  public void setLengthNormFactors(String field, int min, int max,
                                   float steepness) {
    ln_mins.put(field, new Integer(min));
    ln_maxs.put(field, new Integer(max));
    ln_steeps.put(field, new Float(steepness));
  }
    
  /**
   * Implemented as:
   * <code>
   * 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 )
   * </code>
   *
   * <p>
   * This degrades to <code>1/sqrt(x)</code> when min and max are both 1 and
   * steepness is 0.5
   * </p>
   *
   * <p>
   * Field specific factors are used where they have been set, each one
   * falling back independently to the global factor.
   * </p>
   *
   * <p>
   * :TODO: potential optimization is to just flat out return 1.0f if numTerms
   * is between min and max.
   * </p>
   *
   * @param fieldName the field whose factors should be used
   * @param numTerms the length (x) to compute the norm for
   * @return the norm for the given length
   * @see #setLengthNormFactors
   */
  public float lengthNorm(String fieldName, int numTerms) {
    int l = ln_min;
    int h = ln_max;
    float s = ln_steep;

    // The setter never stores null values, so a null result from get()
    // means "no field specific value"; one lookup per map is enough.
    final Number fieldMin = (Number)ln_mins.get(fieldName);
    if (null != fieldMin) {
      l = fieldMin.intValue();
    }
    final Number fieldMax = (Number)ln_maxs.get(fieldName);
    if (null != fieldMax) {
      h = fieldMax.intValue();
    }
    final Number fieldSteep = (Number)ln_steeps.get(fieldName);
    if (null != fieldSteep) {
      s = fieldSteep.floatValue();
    }

    // Distance outside of the [l,h] plateau, counted twice;
    // this is 0 for any length inside the plateau.
    final float outside =
      (float)(Math.abs(numTerms - l) + Math.abs(numTerms - h) - (h - l));

    return (float)(1.0f / Math.sqrt((s * outside) + 1.0f));
  }

  /**
   * Delegates to baselineTf
   *
   * @see #baselineTf
   */
  public float tf(int freq) {
    return baselineTf(freq);
  }
  
  /**
   * Implemented as:
   * <code>
   *  (x &lt;= min) ? base : sqrt(x+(base**2)-min)
   * </code>
   * ...but with a special case check for 0.
   * <p>
   * This degrades to <code>sqrt(x)</code> when min and base are both 0
   * </p>
   *
   * @param freq the frequency (x) to compute the tf for
   * @return 0 if freq is 0, otherwise the value of the function above
   * @see #setBaselineTfFactors
   */
  public float baselineTf(float freq) {

    if (0.0f == freq) return 0.0f;
  
    return (freq <= tf_min)
      ? tf_base
      : (float)Math.sqrt(freq + (tf_base * tf_base) - tf_min);
  }

  /**
   * Uses a hyperbolic tangent function that allows for a hard max...
   *
   * <code>
   * tf(x)=min+(max-min)/2*(((base**(x-xoffset)-base**-(x-xoffset))/(base**(x-xoffset)+base**-(x-xoffset)))+1)
   * </code>
   *
   * <p>
   * This code is provided as a convenience for subclasses that want
   * to use a hyperbolic tf function.
   * </p>
   *
   * <p>
   * When one of the exponentials overflows, the function has reached one
   * of its limits and that limit is returned directly: max when
   * <code>base**(x-xoffset)</code> is the dominant term, min when
   * <code>base**-(x-xoffset)</code> is.
   * </p>
   *
   * @param freq the frequency (x) to compute the tf for
   * @return 0 if freq is 0 (regardless of min), otherwise the value of
   *         the function above
   * @see #setHyperbolicTfFactors
   */
  public float hyperbolicTf(float freq) {
    if (0.0f == freq) return 0.0f;

    final float min = tf_hyper_min;
    final float max = tf_hyper_max;
    final double x = (double)(freq - tf_hyper_xoffset);

    // Each exponential is needed twice, so only compute it once.
    final double rising = Math.pow(tf_hyper_base, x);
    final double falling = Math.pow(tf_hyper_base, -x);

    final float result = min +
      (float)(
              (max-min) / 2.0f
              *
              ( ( (rising - falling) / (rising + falling) ) + 1.0d )
              );

    if (!Float.isNaN(result)) return result;

    // Infinity/Infinity is NaN at both ends of the curve.  If the falling
    // exponential is the one that blew up we are at the bottom of the curve
    // and the limit is min, not max.  Any other NaN (eg: from a NaN or
    // negative base) falls back to max.
    return (falling > rising) ? min : max;
    
  }

}