PatternAnalyzer
===============

public class PatternAnalyzer extends Analyzer

Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
{@link java.io.Reader}, that can flexibly separate text into terms via a regular expression
{@link Pattern} (with behaviour identical to {@link String#split(String)}), and that combines
the functionality of
{@link org.apache.lucene.analysis.LetterTokenizer},
{@link org.apache.lucene.analysis.LowerCaseTokenizer},
{@link org.apache.lucene.analysis.WhitespaceTokenizer} and
{@link org.apache.lucene.analysis.StopFilter} into a single efficient
multi-purpose class.
If you are unsure what a suitable regular expression looks like, consider
prototyping by simply trying various expressions on some test texts via
{@link String#split(String)}. Once you are satisfied, give that regex to
PatternAnalyzer. Also see the Java Regular Expression Tutorial.
This class can be considerably faster than the "normal" Lucene tokenizers.
It can also serve as a building block in a compound Lucene
{@link org.apache.lucene.analysis.TokenFilter} chain, for example in this
stemming example:

```java
PatternAnalyzer pat = ...
TokenStream tokenStream = new SnowballFilter(
    pat.tokenStream("content", "James is running round in the woods"),
    "English");
```
Fields Summary
--------------

| Modifier and Type | Field | Description |
|---|---|---|
| public static final Pattern | NON_WORD_PATTERN = Pattern.compile("\\W+") | Divides text at non-letters (!Character.isLetter(c)). |
| public static final Pattern | WHITESPACE_PATTERN = Pattern.compile("\\s+") | Divides text at whitespace (Character.isWhitespace(c)). |
| private static final Set | EXTENDED_ENGLISH_STOP_WORDS | Extended English stop word set. |
| public static final PatternAnalyzer | DEFAULT_ANALYZER | A lower-casing word analyzer with English stop words (can be shared freely across threads without harm); global per class loader. |
| public static final PatternAnalyzer | EXTENDED_ANALYZER | A lower-casing word analyzer with extended English stop words (can be shared freely across threads without harm); global per class loader. The stop words are borrowed from http://thomas.loc.gov/home/stopwords.html; see http://thomas.loc.gov/home/all.about.inquery.html. |
| private final Pattern | pattern | The pattern used to split text into terms. |
| private final boolean | toLowerCase | Whether terms are lower-cased. |
| private final Set | stopWords | Stop words to discard, or null for none. |
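Since the singletons are safe to share across threads, they can be used directly
without any construction. A minimal sketch (field name and text are placeholders):

```java
import org.apache.lucene.analysis.TokenStream;

// No construction needed; DEFAULT_ANALYZER is global per class loader:
TokenStream stream = PatternAnalyzer.DEFAULT_ANALYZER.tokenStream(
    "content", "The quick brown fox jumps over the lazy dog");
```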
Constructors Summary
--------------------

public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords)

Constructs a new instance with the given parameters.

```java
public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
    if (pattern == null)
        throw new IllegalArgumentException("pattern must not be null");

    // Canonicalize equivalent patterns to the shared constants so that
    // tokenStream(String, String) can detect them by reference comparison
    // and take its fast, regex-free path.
    if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
    else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;

    // Normalize an empty stop word set to null (no stop word filtering).
    if (stopWords != null && stopWords.size() == 0) stopWords = null;

    this.pattern = pattern;
    this.toLowerCase = toLowerCase;
    this.stopWords = stopWords;
}
```
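To illustrate the two normalizations (a sketch; the patterns below are chosen
to match the shared constants, and the empty HashSet is only there to show the
normalization):

```java
import java.util.HashSet;
import java.util.regex.Pattern;

// Compiles to the same regex and flags as WHITESPACE_PATTERN, so the
// constructor swaps in the shared constant and the fast path applies:
PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\s+"), true, null);

// An empty stop word set is normalized to null, i.e. no filtering at all:
PatternAnalyzer b = new PatternAnalyzer(Pattern.compile("\\s+"), true, new HashSet());
```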
Methods Summary
---------------

private static boolean eq(Object o1, Object o2)

Null-safe equality: o1 and/or o2 may be null.

```java
private static boolean eq(Object o1, Object o2) {
    return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
}
```
private static boolean eqPattern(Pattern p1, Pattern p2)

Compares two patterns by flags and regex source; assumes p1 and p2 are not null.

```java
private static boolean eqPattern(Pattern p1, Pattern p2) {
    return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
}
```
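This helper exists because java.util.regex.Pattern does not override equals(),
so two independently compiled, identical patterns only compare equal by content
via eqPattern. A small illustration (usable from inside the class):

```java
import java.util.regex.Pattern;

Pattern a = Pattern.compile("\\W+");
Pattern b = Pattern.compile("\\W+");
boolean byObject  = a.equals(b);     // false: Pattern uses identity equality
boolean byContent = eqPattern(a, b); // true: same flags, same regex source
```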
public boolean equals(Object other)

Indicates whether some other object is "equal to" this one.

```java
public boolean equals(Object other) {
    if (this == other) return true;

    // Fast path: the two shared singletons differ in their stop word sets,
    // so they can never be equal; skip the expensive set comparison.
    if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
    if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;

    if (other instanceof PatternAnalyzer) {
        PatternAnalyzer p2 = (PatternAnalyzer) other;
        return toLowerCase == p2.toLowerCase
            && eqPattern(pattern, p2.pattern)
            && eq(stopWords, p2.stopWords);
    }
    return false;
}
```
public int hashCode()

Returns a hash code value for the object, consistent with equals().

```java
public int hashCode() {
    if (this == DEFAULT_ANALYZER) return -1218418418;  // fast path
    if (this == EXTENDED_ANALYZER) return 1303507063;  // fast path

    int h = 1;
    h = 31 * h + pattern.pattern().hashCode();
    h = 31 * h + pattern.flags();
    h = 31 * h + (toLowerCase ? 1231 : 1237);
    h = 31 * h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
}
```
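Together, equals() and hashCode() give value semantics: two analyzers built
with equivalent parameters are interchangeable, e.g. as HashMap keys. A sketch:

```java
import java.util.regex.Pattern;

PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\W+"), true, null);
PatternAnalyzer b = new PatternAnalyzer(Pattern.compile("\\W+"), true, null);

// Equal by value despite being distinct instances with distinct Pattern objects:
assert a.equals(b);
assert a.hashCode() == b.hashCode(); // required by the equals/hashCode contract
```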
private static Set makeStopSet(String[] stopWords)

Builds a stop word set whose hash table is somewhat oversized (initial capacity
stopWords.length * 2 with load factor 0.3f) to minimize hash collisions; the low
load factor keeps the table sparse, trading memory for faster lookups.

```java
private static Set makeStopSet(String[] stopWords) {
    Set stops = new HashSet(stopWords.length * 2, 0.3f);
    stops.addAll(Arrays.asList(stopWords));
    return stops;
    // return Collections.unmodifiableSet(stops);
}
```
private static String toString(Reader input)

Reads until end-of-stream and returns all read chars, finally closing the stream.
The output buffer at least doubles whenever it fills up, and the old output array
is recycled as the read buffer so that subsequent bulk reads grow larger.

```java
private static String toString(Reader input) throws IOException {
    try {
        int len = 256;
        char[] buffer = new char[len];
        char[] output = new char[len];
        len = 0;
        int n;
        while ((n = input.read(buffer)) >= 0) {
            if (len + n > output.length) { // grow capacity, at least doubling
                char[] tmp = new char[Math.max(output.length << 1, len + n)];
                System.arraycopy(output, 0, tmp, 0, len);
                System.arraycopy(buffer, 0, tmp, len, n);
                buffer = output; // reuse old output as larger buffer for future bulk reads
                output = tmp;
            } else {
                System.arraycopy(buffer, 0, output, len, n);
            }
            len += n;
        }
        return new String(output, 0, len);
    } finally {
        if (input != null) input.close();
    }
}
```
public TokenStream tokenStream(String fieldName, String text)

Creates a token stream that tokenizes the given string into token terms
(aka words).

```java
public TokenStream tokenStream(String fieldName, String text) {
    // Ideally the Analyzer superclass would have a method with the same
    // signature, with a default impl that simply delegates to the
    // StringReader flavour.
    if (text == null)
        throw new IllegalArgumentException("text must not be null");

    TokenStream stream;
    if (pattern == NON_WORD_PATTERN) { // fast path
        stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    } else if (pattern == WHITESPACE_PATTERN) { // fast path
        stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    } else {
        stream = new PatternTokenizer(text, pattern, toLowerCase);
        if (stopWords != null) stream = new StopFilter(stream, stopWords);
    }
    return stream;
}
```
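To make the dispatch concrete, a short sketch (field name, comma pattern and
texts are illustrative):

```java
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;

// Reference-identical to NON_WORD_PATTERN (the constructor canonicalizes
// equivalent patterns), so a FastStringTokenizer is used internally:
PatternAnalyzer fast = new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, null);
TokenStream s1 = fast.tokenStream("content", "James is running round in the woods");

// Any other pattern goes through the generic PatternTokenizer, with a
// StopFilter appended only if stop words were supplied:
PatternAnalyzer csv = new PatternAnalyzer(Pattern.compile(","), false, null);
TokenStream s2 = csv.tokenStream("content", "alpha,beta,gamma");
```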
public TokenStream tokenStream(String fieldName, Reader reader)

Creates a token stream that tokenizes all the text in the given Reader. This
implementation drains the Reader into a String via toString(Reader) and forwards
to tokenStream(String, String); it is therefore less efficient than calling
tokenStream(String, String) directly.

```java
public TokenStream tokenStream(String fieldName, Reader reader) {
    if (reader instanceof FastStringReader) { // fast path
        return tokenStream(fieldName, ((FastStringReader) reader).getString());
    }
    try {
        String text = toString(reader);
        return tokenStream(fieldName, text);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
```
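Callers that already hold a String should prefer tokenStream(String, String).
With a plain Reader the text is fully buffered first, as this sketch shows
(assuming an analyzer built as in the earlier examples):

```java
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;

// The StringReader is drained into a String before tokenization, which is
// why the String overload is the cheaper entry point:
TokenStream stream = analyzer.tokenStream("content", new StringReader("some text"));
```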