Fields Summary
Type | Field | Description
---|---|---
public static final int | DEFAULT_MAX_NUM_TOKENS_PARSED | Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
public static final Analyzer | DEFAULT_ANALYZER | Default analyzer to parse the source doc with.
public static final int | DEFAULT_MIN_TERM_FREQ | Ignore terms with less than this frequency in the source doc.
public static final int | DEFALT_MIN_DOC_FREQ | Ignore words which do not occur in at least this many docs.
public static final boolean | DEFAULT_BOOST | Boost terms in query based on score.
public static final String[] | DEFAULT_FIELD_NAMES | Default field names. Null is used to specify that the field names should be looked up at runtime from the provided reader.
public static final int | DEFAULT_MIN_WORD_LENGTH | Ignore words shorter than this length; if 0, this has no effect.
public static final int | DEFAULT_MAX_WORD_LENGTH | Ignore words longer than this length; if 0, this has no effect.
public static final Set | DEFAULT_STOP_WORDS | Default set of stopwords. If null, no words are filtered as stop words.
private Set | stopWords | Current set of stop words.
public static final int | DEFAULT_MAX_QUERY_TERMS | Return a Query with no more than this many terms.
private Analyzer | analyzer | Analyzer that will be used to parse the doc.
private int | minTermFreq | Ignore words less frequent than this.
private int | minDocFreq | Ignore words which do not occur in at least this many docs.
private boolean | boost | Should we apply a boost to the Query based on the scores?
private String[] | fieldNames | Field names we'll analyze.
private int | maxNumTokensParsed | The maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
private int | minWordLen | Ignore words shorter than this length.
private int | maxWordLen | Ignore words longer than this length.
private int | maxQueryTerms | Don't return a query with more than this many terms.
private Similarity | similarity | For idf() calculations.
private final IndexReader | ir | IndexReader to use.
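These fields are configurable through the getters and setters described below. A minimal configuration sketch, assuming the contrib package org.apache.lucene.search.similar and a hypothetical index path, field names, and threshold values:

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.similar.MoreLikeThis;

    // open the index and tune term selection (values here are illustrative)
    IndexReader reader = IndexReader.open("/path/to/index"); // hypothetical path
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setMinTermFreq(2);                             // term must occur at least twice in the source doc
    mlt.setMinDocFreq(5);                              // ...and in at least 5 docs in the index
    mlt.setMaxQueryTerms(25);                          // cap the generated query at 25 terms
    mlt.setFieldNames(new String[] {"title", "body"}); // hypothetical field names

Later sketches in this summary reuse reader and mlt from this snippet.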
Methods Summary
---
private void | addTermFrequencies(java.util.Map termFreqMap, org.apache.lucene.index.TermFreqVector vector)
Adds the terms and frequencies found in vector into the Map termFreqMap.

    String[] terms = vector.getTerms();
    int[] freqs = vector.getTermFrequencies();
    for (int j = 0; j < terms.length; j++) {
        String term = terms[j];
        if (isNoiseWord(term)) {
            continue;
        }
        // increment frequency
        Int cnt = (Int) termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freqs[j];
        }
        else {
            cnt.x += freqs[j];
        }
    }
|
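The addTermFrequencies methods rely on a small mutable counter class Int that is not shown in this summary. A minimal sketch consistent with its usage: the counter must start at 1, because the Reader-based variant below puts a fresh Int for a word's first occurrence without touching x.

    // sketch of the Int counter assumed by the code above;
    // in Lucene it is a private inner class of MoreLikeThis
    private static class Int {
        int x;

        Int() {
            x = 1; // a newly created counter represents one occurrence
        }
    }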
private void | addTermFrequencies(java.io.Reader r, java.util.Map termFreqMap, java.lang.String fieldName)
Adds the term frequencies found by tokenizing text from the reader into the Map termFreqMap.

    TokenStream ts = analyzer.tokenStream(fieldName, r);
    org.apache.lucene.analysis.Token token;
    int tokenCount = 0;
    while ((token = ts.next()) != null) { // for every token
        String word = token.termText();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = (Int) termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int()); // Int starts at 1, counting this first occurrence
        }
        else {
            cnt.x++;
        }
    }
|
private org.apache.lucene.search.Query | createQuery(org.apache.lucene.util.PriorityQueue q)
Creates the "more like this" query from a PriorityQueue.

    BooleanQuery query = new BooleanQuery();
    Object cur;
    int qterms = 0;
    float bestScore = 0;
    while ((cur = q.pop()) != null) {
        Object[] ar = (Object[]) cur;
        TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
        if (boost) {
            if (qterms == 0) {
                // the first term popped has the highest score; use it to normalize the boosts
                bestScore = ((Float) ar[2]).floatValue();
            }
            float myScore = ((Float) ar[2]).floatValue();
            tq.setBoost(myScore / bestScore);
        }
        try {
            query.add(tq, BooleanClause.Occur.SHOULD);
        }
        catch (BooleanQuery.TooManyClauses ignore) {
            break;
        }
        qterms++;
        if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
            break;
        }
    }
    return query;
|
private org.apache.lucene.util.PriorityQueue | createQueue(java.util.Map words)
Creates a PriorityQueue from a word-to-term-frequency map.

    // have collected all words in doc and their freqs
    int numDocs = ir.numDocs();
    FreqQ res = new FreqQ(words.size()); // will order words by score
    Iterator it = words.keySet().iterator();
    while (it.hasNext()) { // for every word
        String word = (String) it.next();
        int tf = ((Int) words.get(word)).x; // term freq in the source doc
        if (minTermFreq > 0 && tf < minTermFreq) {
            continue; // filter out words that don't occur enough times in the source
        }
        // go through all the fields and find the largest document frequency
        String topField = fieldNames[0];
        int docFreq = 0;
        for (int i = 0; i < fieldNames.length; i++) {
            int freq = ir.docFreq(new Term(fieldNames[i], word));
            topField = (freq > docFreq) ? fieldNames[i] : topField;
            docFreq = (freq > docFreq) ? freq : docFreq;
        }
        if (minDocFreq > 0 && docFreq < minDocFreq) {
            continue; // filter out words that don't occur in enough docs
        }
        if (docFreq == 0) {
            continue; // index update problem?
        }
        float idf = similarity.idf(docFreq, numDocs);
        float score = tf * idf;
        // only really need the 1st 3 entries; the others are for troubleshooting
        res.insert(new Object[]{word,                 // the word
                                topField,             // the top field
                                new Float(score),     // overall score
                                new Float(idf),       // idf
                                new Integer(docFreq), // freq in all docs
                                new Integer(tf)});    // freq in the source doc
    }
    return res;
|
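FreqQ, the queue type built here, is also not shown in this summary. It must order entries so that pop() returns the highest-scoring term first, since createQuery() treats the first popped entry as bestScore. A sketch consistent with that contract, assuming Lucene's org.apache.lucene.util.PriorityQueue base class:

    // orders Object[] entries by descending score (entry[2]); with Lucene's
    // PriorityQueue, the element for which lessThan holds is the one pop() returns first
    private static class FreqQ extends org.apache.lucene.util.PriorityQueue {
        FreqQ(int s) {
            initialize(s);
        }

        protected boolean lessThan(Object a, Object b) {
            float fa = ((Float) ((Object[]) a)[2]).floatValue();
            float fb = ((Float) ((Object[]) b)[2]).floatValue();
            return fa > fb; // "less" means higher score, so the best term pops first
        }
    }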
public java.lang.String | describeParams()
Describes the parameters that control how the "more like this" query is formed.

    StringBuffer sb = new StringBuffer();
    sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
    sb.append("\t" + "minWordLen : " + minWordLen + "\n");
    sb.append("\t" + "maxWordLen : " + maxWordLen + "\n");
    sb.append("\t" + "fieldNames : ");
    String delim = "";
    for (int i = 0; i < fieldNames.length; i++) {
        String fieldName = fieldNames[i];
        sb.append(delim).append(fieldName);
        delim = ", ";
    }
    sb.append("\n");
    sb.append("\t" + "boost : " + boost + "\n");
    sb.append("\t" + "minTermFreq : " + minTermFreq + "\n");
    sb.append("\t" + "minDocFreq : " + minDocFreq + "\n");
    return sb.toString();
|
public org.apache.lucene.analysis.Analyzer | getAnalyzer()
Returns the analyzer that will be used to parse the source doc. The default analyzer is {@link #DEFAULT_ANALYZER}.

    return analyzer;
|
public java.lang.String[] | getFieldNames()
Returns the field names that will be used when generating the 'More Like This' query. The default field names are {@link #DEFAULT_FIELD_NAMES}.

    return fieldNames;
|
public int | getMaxNumTokensParsed()
Returns the maximum number of tokens to parse in each example doc field that is not stored with TermVector support. The default is {@link #DEFAULT_MAX_NUM_TOKENS_PARSED}.

    return maxNumTokensParsed;
|
public int | getMaxQueryTerms()
Returns the maximum number of query terms that will be included in any generated query. The default is {@link #DEFAULT_MAX_QUERY_TERMS}.

    return maxQueryTerms;
|
public int | getMaxWordLen()
Returns the maximum word length above which words will be ignored. Set this to 0 for no maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.

    return maxWordLen;
|
public int | getMinDocFreq()
Returns the minimum number of documents in which a word must occur; words occurring in fewer docs are ignored. The default is {@link #DEFALT_MIN_DOC_FREQ}.

    return minDocFreq;
|
public int | getMinTermFreq()
Returns the frequency below which terms will be ignored in the source doc. The default frequency is {@link #DEFAULT_MIN_TERM_FREQ}.

    return minTermFreq;
|
public int | getMinWordLen()
Returns the minimum word length below which words will be ignored. Set this to 0 for no minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.

    return minWordLen;
|
public java.util.Set | getStopWords()
Gets the current set of stop words being used.

    return stopWords;
|
public boolean | isBoost()
Returns whether to boost terms in the query based on "score" or not. The default is {@link #DEFAULT_BOOST}.

    return boost;
|
private boolean | isNoiseWord(java.lang.String term)
Determines whether the passed term should be ignored as noise in "more like" comparisons: returns true if the term is too short, too long, or a stop word.

    int len = term.length();
    if (minWordLen > 0 && len < minWordLen) {
        return true;
    }
    if (maxWordLen > 0 && len > maxWordLen) {
        return true;
    }
    if (stopWords != null && stopWords.contains(term)) {
        return true;
    }
    return false;
|
public org.apache.lucene.search.Query | like(int docNum)
Return a query that will return docs like the one with the passed Lucene document ID.

    if (fieldNames == null) {
        // gather the list of valid fields from lucene
        Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
        fieldNames = (String[]) fields.toArray(new String[fields.size()]);
    }
    return createQuery(retrieveTerms(docNum));
|
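A typical call site, continuing from the configuration sketch above (the document ID here is hypothetical):

    // find documents similar to an already-indexed document
    Query query = mlt.like(42); // 42 is a hypothetical Lucene document ID
    Hits hits = new IndexSearcher(reader).search(query);
    System.out.println("found " + hits.length() + " similar docs");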
public org.apache.lucene.search.Query | like(java.io.File f)
Return a query that will return docs like the contents of the passed file.

    if (fieldNames == null) {
        // gather the list of valid fields from lucene
        Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
        fieldNames = (String[]) fields.toArray(new String[fields.size()]);
    }
    return like(new FileReader(f));
|
public org.apache.lucene.search.Query | like(java.net.URL u)
Return a query that will return docs like the content at the passed URL.

    return like(new InputStreamReader(u.openConnection().getInputStream()));
|
public org.apache.lucene.search.Query | like(java.io.InputStream is)
Return a query that will return docs like the contents of the passed stream.

    return like(new InputStreamReader(is));
|
public org.apache.lucene.search.Query | like(java.io.Reader r)
Return a query that will return docs like the contents of the passed Reader.

    return createQuery(retrieveTerms(r));
|
public static void | main(java.lang.String[] a)
Test driver. Pass in "-i INDEX" and then either "-f FILE" or "-url URL".

    String indexName = "localhost_index";
    String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
    URL url = null;
    for (int i = 0; i < a.length; i++) {
        if (a[i].equals("-i")) {
            indexName = a[++i];
        }
        else if (a[i].equals("-f")) {
            fn = a[++i];
        }
        else if (a[i].equals("-url")) {
            url = new URL(a[++i]);
        }
    }
    PrintStream o = System.out;
    IndexReader r = IndexReader.open(indexName);
    o.println("Open index " + indexName + " which has " + r.numDocs() + " docs");
    MoreLikeThis mlt = new MoreLikeThis(r);
    o.println("Query generation parameters:");
    o.println(mlt.describeParams());
    o.println();
    Query query = null;
    if (url != null) {
        o.println("Parsing URL: " + url);
        query = mlt.like(url);
    }
    else if (fn != null) {
        o.println("Parsing file: " + fn);
        query = mlt.like(new File(fn));
    }
    o.println("q: " + query);
    o.println();
    IndexSearcher searcher = new IndexSearcher(indexName);
    Hits hits = searcher.search(query);
    int len = hits.length();
    o.println("found: " + len + " documents matching");
    o.println();
    for (int i = 0; i < Math.min(25, len); i++) {
        Document d = hits.doc(i);
        String summary = d.get("summary");
        o.println("score : " + hits.score(i));
        o.println("url : " + d.get("url"));
        o.println("\ttitle : " + d.get("title"));
        if (summary != null)
            o.println("\tsummary: " + summary);
        o.println();
    }
|
public java.lang.String[] | retrieveInterestingTerms(java.io.Reader r)
Convenience routine to make it easy to return the most interesting words in a document. More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.

    ArrayList al = new ArrayList(maxQueryTerms);
    PriorityQueue pq = retrieveTerms(r);
    Object cur;
    // have to be careful: retrieveTerms returns all words, but that's probably
    // not useful to our caller; we just want to return the top words
    int lim = maxQueryTerms;
    while (((cur = pq.pop()) != null) && lim-- > 0) {
        Object[] ar = (Object[]) cur;
        al.add(ar[0]); // the 1st entry is the interesting word
    }
    String[] res = new String[al.size()];
    return (String[]) al.toArray(res);
|
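For example, to print the most interesting words from a text file (the file name is hypothetical, mlt as configured above):

    String[] terms = mlt.retrieveInterestingTerms(new FileReader("example.txt"));
    for (int i = 0; i < terms.length; i++) {
        System.out.println(terms[i]);
    }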
private org.apache.lucene.util.PriorityQueue | retrieveTerms(int docNum)
Finds words for a more-like-this query former.

    Map termFreqMap = new HashMap();
    for (int i = 0; i < fieldNames.length; i++) {
        String fieldName = fieldNames[i];
        TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
        if (vector == null) {
            // field does not store term vector info: re-analyze the stored field values instead
            Document d = ir.document(docNum);
            String[] text = d.getValues(fieldName);
            if (text != null) {
                for (int j = 0; j < text.length; j++) {
                    addTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName);
                }
            }
        }
        else {
            addTermFrequencies(termFreqMap, vector);
        }
    }
    return createQueue(termFreqMap);
|
public org.apache.lucene.util.PriorityQueue | retrieveTerms(java.io.Reader r)
Finds words for a more-like-this query former. The result is a priority queue of arrays with one entry for every word in the document. Each array has 6 elements:
- The word (String)
- The top field that this word comes from (String)
- The score for this word (Float)
- The IDF value (Float)
- The frequency of this word in the index (Integer)
- The frequency of this word in the source document (Integer)
This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. This method is exposed so that you can identify the "interesting words" in a document. For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. A sketch of consuming these arrays follows this entry.

    Map words = new HashMap();
    for (int i = 0; i < fieldNames.length; i++) {
        String fieldName = fieldNames[i];
        addTermFrequencies(r, words, fieldName);
    }
    return createQueue(words);
|
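As noted above, a sketch of consuming these 6-element arrays (the input text is hypothetical, mlt as configured above):

    PriorityQueue pq = mlt.retrieveTerms(new StringReader("some example text"));
    Object cur;
    while ((cur = pq.pop()) != null) { // pop() returns the highest-scoring term first
        Object[] ar = (Object[]) cur;
        System.out.println(ar[0]      // the word
            + " field=" + ar[1]       // the top field
            + " score=" + ar[2]       // tf * idf
            + " idf=" + ar[3]         // idf value
            + " docFreq=" + ar[4]     // freq in the index
            + " tf=" + ar[5]);        // freq in the source text
    }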
public void | setAnalyzer(org.apache.lucene.analysis.Analyzer analyzer)
Sets the analyzer to use. An analyzer is not required for generating a query with the {@link #like(int)} method; all other 'like' methods require an analyzer.

    this.analyzer = analyzer;
|
public void | setBoost(boolean boost)
Sets whether to boost terms in the query based on "score" or not.

    this.boost = boost;
|
public void | setFieldNames(java.lang.String[] fieldNames)
Sets the field names that will be used when generating the 'More Like This' query. Set this to null for the field names to be determined at runtime from the IndexReader provided in the constructor.

    this.fieldNames = fieldNames;
|
public void | setMaxNumTokensParsed(int i)
Sets the maximum number of tokens to parse in each example doc field that is not stored with TermVector support.

    maxNumTokensParsed = i;
|
public void | setMaxQueryTerms(int maxQueryTerms)
Sets the maximum number of query terms that will be included in any generated query.

    this.maxQueryTerms = maxQueryTerms;
|
public void | setMaxWordLen(int maxWordLen)
Sets the maximum word length above which words will be ignored.

    this.maxWordLen = maxWordLen;
|
public void | setMinDocFreq(int minDocFreq)
Sets the minimum number of documents in which a word must occur; words occurring in fewer docs are ignored.

    this.minDocFreq = minDocFreq;
|
public void | setMinTermFreq(int minTermFreq)
Sets the frequency below which terms will be ignored in the source doc.

    this.minTermFreq = minTermFreq;
|
public void | setMinWordLen(int minWordLen)
Sets the minimum word length below which words will be ignored.

    this.minWordLen = minWordLen;
|
public void | setStopWords(java.util.Set stopWords)
Sets the set of stopwords. Any word in this set is considered "uninteresting" and ignored. Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".

    this.stopWords = stopWords;
|
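For instance, one might reuse an analyzer's stop-word list; a sketch assuming the StopFilter.makeStopSet() helper and the StopAnalyzer.ENGLISH_STOP_WORDS array from the same era of Lucene as this API:

    // treat the standard English stop words as uninteresting
    mlt.setStopWords(StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));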