File Doc Category Size Date Package
StopFilter.java API Doc Apache Lucene 1.9 5459 Mon Feb 20 09:19:46 GMT 2006 org.apache.lucene.analysis

StopFilter

java.lang.Object
- org.apache.lucene.analysis.TokenStream
  - org.apache.lucene.analysis.TokenFilter

public final class StopFilter extends TokenFilter

Removes stop words from a token stream.

Fields Summary
// Stop-word collection: next() drops every token whose (optionally lower-cased) term text is contained in it.
private final Set
stopWords
// When true, next() lower-cases each token's term text before looking it up in stopWords.
private final boolean
ignoreCase
Constructors Summary
/**
 * Builds a filter over <code>input</code> that removes the words listed in
 * <code>stopWords</code>. Delegates to the three-argument array constructor
 * with <code>ignoreCase</code> set to <code>false</code>.
 *
 * @param input the token stream to filter
 * @param stopWords the words to remove
 */
public StopFilter(TokenStream input, String[] stopWords) {
  this(input, stopWords, false);
}
/**
 * Builds a filter that removes from the input TokenStream every word named
 * in the given array. The array is converted to a Set through
 * {@link #makeStopSet(java.lang.String[], boolean)}, using the same
 * <code>ignoreCase</code> flag.
 *
 * @param in the token stream to filter
 * @param stopWords the words to remove
 * @param ignoreCase passed on to makeStopSet and stored for use when matching tokens
 */
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
  super(in);
  // makeStopSet reads the parameter, not the field, so assignment order is irrelevant.
  this.stopWords = makeStopSet(stopWords, ignoreCase);
  this.ignoreCase = ignoreCase;
}
/**
 * Builds a filter that removes from the input TokenStream every word named
 * in the Hashtable. Delegates to the three-argument Hashtable constructor
 * with <code>ignoreCase</code> set to <code>false</code>.
 *
 * @param in the token stream to filter
 * @param stopTable table naming the stop words
 * @deprecated Use {@link #StopFilter(TokenStream, Set)} instead
 */
public StopFilter(TokenStream in, Hashtable stopTable) {
  this(in, stopTable, false);
}
/**
 * Builds a filter that removes from the input TokenStream every word named
 * in the Hashtable. Only the table's key set is used; it is handed to the
 * (TokenStream, Set, boolean) constructor together with
 * <code>ignoreCase</code>. If <code>ignoreCase</code> is true, all keys in
 * the stopTable should already be lowercased.
 *
 * @param in the token stream to filter
 * @param stopTable table whose keys are the stop words
 * @param ignoreCase whether token text is lower-cased before lookup
 * @deprecated Use {@link #StopFilter(TokenStream, Set, boolean)} instead;
 *             the two-argument Set constructor always passes
 *             <code>ignoreCase = false</code>.
 */
public StopFilter(TokenStream in, Hashtable stopTable, boolean ignoreCase) {
  this(in, stopTable.keySet(), ignoreCase);
}
/**
 * Builds a token stream that filters the given input against a caller-supplied
 * stop set. The set is stored as given (it is not copied).
 *
 * @param input the token stream to filter
 * @param stopWords The set of stop words, as Strings. If ignoreCase is true,
 *                  all strings should be lower cased.
 * @param ignoreCase Ignore case when stopping. The stopWords set must be set
 *                   up to contain only lower case words.
 */
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase) {
  super(input);
  this.stopWords = stopWords;
  this.ignoreCase = ignoreCase;
}
/**
 * Builds a filter that removes from the input TokenStream every word named
 * in the Set. Delegates to the three-argument Set constructor with
 * <code>ignoreCase</code> set to <code>false</code>.
 * It is crucial that an efficient Set implementation is used for maximum
 * performance.
 *
 * @param in the token stream to filter
 * @param stopWords the set of stop words
 * @see #makeStopSet(java.lang.String[])
 */
public StopFilter(TokenStream in, Set stopWords) {
  this(in, stopWords, false);
}
Methods Summary
public static final java.util.Set makeStopSet(java.lang.String[] stopWords, boolean ignoreCase)
param
stopWords
param
ignoreCase If true, all words are lower cased first.
return
a Set containing the words
HashSet stopTable = new HashSet(stopWords.length); for (int i = 0; i < stopWords.length; i++) stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]); return stopTable;
public static final java.util.Set makeStopSet(java.lang.String[] stopWords)
Builds a Set from an array of stop words, appropriate for passing into the StopFilter constructor. This permits this stopWords construction to be cached once when an Analyzer is constructed.
see
#makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
return makeStopSet(stopWords, false);
public static final java.util.Hashtable makeStopTable(java.lang.String[] stopWords)
Builds a Hashtable from an array of stop words, appropriate for passing into the StopFilter constructor. This permits this table construction to be cached once when an Analyzer is constructed.
deprecated
Use {@link #makeStopSet(String[])} instead.
return makeStopTable(stopWords, false);
public static final java.util.Hashtable makeStopTable(java.lang.String[] stopWords, boolean ignoreCase)
Builds a Hashtable from an array of stop words, appropriate for passing into the StopFilter constructor. This permits this table construction to be cached once when an Analyzer is constructed.
deprecated
Use {@link #makeStopSet(java.lang.String[], boolean)} instead.
Hashtable stopTable = new Hashtable(stopWords.length); for (int i = 0; i < stopWords.length; i++) { String stopWord = ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]; stopTable.put(stopWord, stopWord); } return stopTable;
public final org.apache.lucene.analysis.Token next()
Returns the next input Token whose termText() is not a stop word.
// return the first non-stop word found for (Token token = input.next(); token != null; token = input.next()) { String termText = ignoreCase ? token.termText.toLowerCase() : token.termText; if (!stopWords.contains(termText)) return token; } // reached EOS -- return null return null;