AnalyzingQueryParser — public class AnalyzingQueryParser extends QueryParser. Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
are also passed through the given analyzer, but wildcard characters (like *)
don't get removed from the search terms.
Warning: This class should only be used with analyzers that neither use stopwords
nor add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
will turn Häuser into hau, but H?user will
become h?user when using this parser and thus no match would be found (i.e.
using this parser will be no improvement over QueryParser in such cases). |
Constructors Summary |
---|
public AnalyzingQueryParser(String field, Analyzer analyzer)Constructs a query parser.
// Pass the field and analyzer straight through to the superclass (QueryParser) constructor.
super(field, analyzer);
|
Methods Summary |
---|
protected org.apache.lucene.search.Query | getFuzzyQuery(java.lang.String field, java.lang.String termStr, float minSimilarity)Called when parser parses an input term token that has the fuzzy suffix (~) appended.
Depending on analyzer and settings, a fuzzy term may (most probably will)
be lower-cased automatically. It will go through the default Analyzer.
Overrides super class, by passing terms through analyzer.
// Feed the raw term to the parser's analyzer; a usable fuzzy term is exactly one token.
TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(termStr));
org.apache.lucene.analysis.Token first = null;
boolean hasExtraToken = false;
try {
  first = stream.next();
  // A second token means the analyzer split or expanded the term.
  hasExtraToken = (stream.next() != null);
} catch (IOException e) {
  // A read failure is handled like an empty stream: no token, hence no query.
  first = null;
}
try {
  stream.close();
} catch (IOException e) {
  // ignore
}
if (hasExtraToken) {
  throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
      + " - tokens were added");
}
if (first == null) {
  // The analyzer produced nothing for this term, so there is nothing to query for.
  return null;
}
// Delegate to the superclass with the analyzed form of the term.
return super.getFuzzyQuery(field, first.termText(), minSimilarity);
| protected org.apache.lucene.search.Query | getPrefixQuery(java.lang.String field, java.lang.String termStr)Called when parser parses an input term
token that uses prefix notation; that is, contains a single '*' wildcard
character as its last character. Since this is a special case
of generic wildcard term, and such a query can be optimized easily,
this usually results in a different query object.
Depending on analyzer and settings, a prefix term may (most probably will)
be lower-cased automatically. It will go through the default Analyzer.
Overrides super class, by passing terms through analyzer.
// Analyze the prefix term and gather the text of every token the analyzer emits.
TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(termStr));
List terms = new ArrayList();
for (;;) {
  org.apache.lucene.analysis.Token token;
  try {
    token = stream.next();
  } catch (IOException e) {
    // A read failure ends the loop exactly like the end of the stream does.
    token = null;
  }
  if (token == null) {
    break;
  }
  terms.add(token.termText());
}
try {
  stream.close();
} catch (IOException e) {
  // ignore
}
if (terms.size() != 1) {
  /* Anything other than exactly one token cannot be turned into a
   * PrefixQuery, so the term is rejected. */
  throw new ParseException("Cannot build PrefixQuery with analyzer "
      + getAnalyzer().getClass() + " - token was consumed");
}
// Exactly one token: hand its analyzed text to the superclass.
return super.getPrefixQuery(field, (String) terms.get(0));
| protected org.apache.lucene.search.Query | getRangeQuery(java.lang.String field, java.lang.String part1, java.lang.String part2, boolean inclusive)Overrides super class, by passing terms through analyzer.
// Each range endpoint is analyzed on its own; an endpoint that yields more
// than one token is rejected, and one that yields none is left unchanged.

// part1
TokenStream firstStream = getAnalyzer().tokenStream(field, new StringReader(part1));
boolean firstHasExtra = false;
try {
  org.apache.lucene.analysis.Token token = firstStream.next();
  if (token != null) {
    part1 = token.termText();
  }
  firstHasExtra = (firstStream.next() != null);
} catch (IOException e) {
  // Read failure: keep whatever part1 currently holds and carry on.
}
try {
  firstStream.close();
} catch (IOException e) {
  // ignore
}
if (firstHasExtra) {
  throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
      + " - tokens were added to part1");
}

// part2
TokenStream secondStream = getAnalyzer().tokenStream(field, new StringReader(part2));
boolean secondHasExtra = false;
try {
  org.apache.lucene.analysis.Token token = secondStream.next();
  if (token != null) {
    part2 = token.termText();
  }
  secondHasExtra = (secondStream.next() != null);
} catch (IOException e) {
  // Read failure: keep whatever part2 currently holds and carry on.
}
try {
  secondStream.close();
} catch (IOException e) {
  // ignore
}
if (secondHasExtra) {
  throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
      + " - tokens were added to part2");
}

// Both endpoints are settled; let the superclass build the query.
return super.getRangeQuery(field, part1, part2, inclusive);
| protected org.apache.lucene.search.Query | getWildcardQuery(java.lang.String field, java.lang.String termStr)Called when parser
parses an input term token that contains one or more wildcard
characters (like * ), but is not a prefix term token (one
that has just a single * character at the end).
Example: will be called for H?user or for H*user
but not for *user .
Depending on analyzer and settings, a wildcard term may (most probably will)
be lower-cased automatically. It will go through the default Analyzer.
Overrides super class, by passing terms through analyzer.
// Overview: split termStr into alternating runs of ordinary characters
// (tlist) and wildcard characters (wlist), analyze the whole term, overwrite
// the ordinary runs with the analyzer's tokens, and then stitch the wildcard
// runs back in between them.
List tlist = new ArrayList(); // runs of non-wildcard characters, in order
List wlist = new ArrayList(); // runs of '?' / '*' characters, in order
/* somewhat a hack: find/store wildcard chars
 * in order to put them back after analyzing */
boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
StringBuffer tmpBuffer = new StringBuffer();
char[] chars = termStr.toCharArray();
for (int i = 0; i < chars.length; i++) {
  if (chars[i] == '?' || chars[i] == '*') {
    if (isWithinToken) {
      // An ordinary run just ended: store it and start collecting wildcards.
      tlist.add(tmpBuffer.toString());
      tmpBuffer.setLength(0);
    }
    isWithinToken = false;
  } else {
    if (!isWithinToken) {
      // A wildcard run just ended: store it and start collecting ordinary chars.
      wlist.add(tmpBuffer.toString());
      tmpBuffer.setLength(0);
    }
    isWithinToken = true;
  }
  tmpBuffer.append(chars[i]);
}
// Flush whichever kind of run was still open when the input ended.
if (isWithinToken) {
  tlist.add(tmpBuffer.toString());
} else {
  wlist.add(tmpBuffer.toString());
}

// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
org.apache.lucene.analysis.Token t;
// Number of ordinary runs replaced so far; -1 once the analyzer has
// produced more non-empty tokens than there are runs.
int countTokens = 0;
while (true) {
  try {
    t = source.next();
  } catch (IOException e) {
    // A read failure is treated like the end of the stream.
    t = null;
  }
  if (t == null) {
    break;
  }
  if (!"".equals(t.termText())) {
    if (countTokens >= 0 && countTokens < tlist.size()) {
      tlist.set(countTokens++, t.termText());
    } else {
      // Too many tokens: remember the mismatch so the size check below fails.
      countTokens = -1;
    }
  }
}
try {
  source.close();
} catch (IOException e) {
  // ignore
}

if (countTokens != tlist.size()) {
  /* this means that the analyzer used either added or consumed
   * (common for a stemmer) tokens, and we can't build a WildcardQuery */
  throw new ParseException("Cannot build WildcardQuery with analyzer "
      + getAnalyzer().getClass() + " - tokens added or lost");
}

if (tlist.size() == 0) {
  return null;
} else if (tlist.size() == 1) {
  if (wlist.size() == 1) {
    /* if wlist contains one wildcard, it must be at the end, because:
     * 1) wildcards are not allowed in 1st position of a term by QueryParser
     * 2) if wildcard was *not* in end, there would be *two* or more tokens */
    return super.getWildcardQuery(field, (String) tlist.get(0) + (String) wlist.get(0));
  } else {
    /* we should never get here! if so, this method was called
     * with a termStr containing no wildcard ... */
    throw new IllegalArgumentException("getWildcardQuery called without wildcard");
  }
} else {
  /* the term was tokenized, let's rebuild to one token
   * with wildcards put back in position */
  StringBuffer sb = new StringBuffer();
  for (int i = 0; i < tlist.size(); i++) {
    sb.append((String) tlist.get(i));
    if (wlist.size() > i) {
      sb.append((String) wlist.get(i));
    }
  }
  return super.getWildcardQuery(field, sb.toString());
}
|
|