File Doc Category Size Date Package
BayesianAnalyzer.java API Doc Apache James 2.3.1 24103 Fri Jan 12 12:56:32 GMT 2007 org.apache.james.util

BayesianAnalyzer

java.lang.Object

public class BayesianAnalyzer extends Object

Determines probability that text contains Spam.

Based upon Paul Grahams' A Plan for Spam. Extended to Paul Grahams' Better Bayesian Filtering.

Sample method usage:

Use: void addHam(Reader) and void addSpam(Reader) methods to build up the Maps of ham & spam tokens/occurrences. Both addHam and addSpam assume they're reading one message at a time, if you feed more than one message per call, be sure to adjust the appropriate message counter: hamMessageCount or spamMessageCount. Then...

Use: void buildCorpus() to build the final token/probabilities Map. Use your own methods for persistent storage of either the individual ham/spam corpus & message counts, and/or the final corpus. Then you can...

Use: double computeSpamProbability(Reader) to determine the probability that a particular text contains spam. A returned result of 0.9 or above is an indicator that the text was spam.

If you use persistent storage, use: void setCorpus(Map) before calling computeSpamProbability.

version: CVS $Revision: $ $Date: $
since: 2.3.0

Fields Summary
private static final int
MAX_INTERESTING_TOKENS
Number of "interesting" tokens to use to compute overall spamminess probability.
private static final double
INTERESTINGNESS_THRESHOLD
Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall spamminess probability.
private static final double
DEFAULT_TOKEN_PROBABILITY
Default token probability to use when a token has not been encountered before.
private Map
hamTokenCounts
Map of ham tokens and their occurrences. String key Integer value
private Map
spamTokenCounts
Map of spam tokens and their occurrences. String key Integer value
private int
hamMessageCount
Number of ham messages analyzed.
private int
spamMessageCount
Number of spam messages analyzed.
private Map
corpus
Final token/probability corpus. String key Double value
Constructors Summary
public BayesianAnalyzer()
Basic class constructor.

Methods Summary
public void addHam(java.io.Reader stream)
Adds a message to the ham list.
param
stream A reader stream on the ham message to analyze
throws
IOException If any error occurs
addTokenOccurrences(stream, hamTokenCounts); hamMessageCount++;
public void addSpam(java.io.Reader stream)
Adds a message to the spam list.
param
stream A reader stream on the spam message to analyze
throws
IOException If any error occurs
addTokenOccurrences(stream, spamTokenCounts); spamMessageCount++;
private void addTokenOccurrences(java.io.Reader stream, java.util.Map target)
Parses a stream into tokens, and updates the target Map with the token/counts.
param
stream
param
target
String token; String header = ""; //Update target with the tokens/count encountered. while ((token = nextToken(stream)) != null) { boolean endingLine = false; if (token.length() > 0 && token.charAt(token.length() - 1) == '\n") { endingLine = true; token = token.substring(0, token.length() - 1); } if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) { if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:") ) { header = token; if (!endingLine) { continue; } } token = header + token; Integer value = null; if (target.containsKey(token)) { value = new Integer(((Integer) target.get(token)).intValue() + 1); } else { value = new Integer(1); } target.put(token, value); } if (endingLine) { header = ""; } }
private boolean allDigits(java.lang.String s)
for (int i = 0; i < s.length(); i++) { if (!Character.isDigit(s.charAt(i))) { return false; } } return true;
private boolean allSameChar(java.lang.String s)
if (s.length() < 2) { return false; } char c = s.charAt(0); for (int i = 1; i < s.length(); i++) { if (s.charAt(i) != c) { return false; } } return true;
public void buildCorpus()
Builds the corpus from the existing ham & spam counts.
//Combine the known ham & spam tokens. Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size()); set.addAll(hamTokenCounts.keySet()); set.addAll(spamTokenCounts.keySet()); Map tempCorpus = new HashMap(set.size()); //Iterate through all the tokens and compute their new //individual probabilities. Iterator i = set.iterator(); while (i.hasNext()) { String token = (String) i.next(); tempCorpus.put(token, new Double(computeProbability(token))); } setCorpus(tempCorpus);
private java.util.Collection buildDegenerated(java.lang.String fullToken)
ArrayList tokens = new ArrayList(); String header; String token; // look for a header string termination int headerEnd = fullToken.indexOf(':"); if (headerEnd >= 0) { header = fullToken.substring(0, headerEnd); token = fullToken.substring(headerEnd); } else { header = ""; token = fullToken; } int end = token.length(); do { if (!token.substring(0, end).equals(token.substring(0, end).toLowerCase())) { tokens.add(header + token.substring(0, end).toLowerCase()); if (header.length() > 0) { tokens.add(token.substring(0, end).toLowerCase()); } } if (end > 1 && token.charAt(0) >= 'A" && token.charAt(0) <= 'Z") { tokens.add(header + token.charAt(0) + token.substring(1, end).toLowerCase()); if (header.length() > 0) { tokens.add(token.charAt(0) + token.substring(1, end).toLowerCase()); } } if (token.charAt(end - 1) != '!") { break; } end--; tokens.add(header + token.substring(0, end)); if (header.length() > 0) { tokens.add(token.substring(0, end)); } } while (end > 0); return tokens;
public void clear()
Clears all analysis repositories and counters.
corpus.clear(); tokenCountsClear(); hamMessageCount = 0; spamMessageCount = 0;
private double computeOverallProbability(java.util.SortedSet tokenProbabilityStrengths, java.util.Map workCorpus)
Compute the spamminess probability of the interesting tokens in the tokenProbabilities SortedSet.
param
tokenProbabilities
param
workCorpus
return
Computed spamminess.
double p = 1.0; double np = 1.0; double tempStrength = 0.5; int count = MAX_INTERESTING_TOKENS; Iterator iterator = tokenProbabilityStrengths.iterator(); while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) { TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next(); tempStrength = tps.strength; // System.out.println(tps); double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default Double theDoubleObject = (Double) workCorpus.get(tps.token); // if either the original token or a degeneration was found use the double value, otherwise use the default if (theDoubleObject != null) { theDoubleValue = theDoubleObject.doubleValue(); } p *= theDoubleValue; np *= (1.0 - theDoubleValue); // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np)); } return (p / (p + np));
private double computeProbability(java.lang.String token)
Compute the probability that "token" is SPAM.
param
token
return
The probability that the token occurs within spam.
double hamFactor = 0; double spamFactor = 0; boolean foundInHam = false; boolean foundInSpam = false; double minThreshold = 0.01; double maxThreshold = 0.99; if (hamTokenCounts.containsKey(token)) { foundInHam = true; } if (spamTokenCounts.containsKey(token)) { foundInSpam = true; } if (foundInHam) { hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue(); if (!foundInSpam) { minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002; } } if (foundInSpam) { spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue(); if (!foundInHam) { maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998; } } if ((hamFactor + spamFactor) < 5) { //This token hasn't been seen enough. return 0.4; } double spamFreq = Math.min(1.0, spamFactor / spamMessageCount); double hamFreq = Math.min(1.0, hamFactor / hamMessageCount); return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
public double computeSpamProbability(java.io.Reader stream)
Computes the probability that the stream contains SPAM.
param
stream The text to be analyzed for Spamminess.
return
A 0.0 - 1.0 probability
throws
IOException If any error occurs
//Build a set of the tokens in the Stream. Set tokens = parse(stream); // Get the corpus to use in this run // A new corpus may be being built in the meantime Map workCorpus = getCorpus(); //Assign their probabilities from the Corpus (using an additional //calculation to determine spamminess). SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus); //Compute and return the overall probability that the //stream is SPAM. return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
public java.util.Map getCorpus()
Public getter for corpus.
return this.corpus;
public int getHamMessageCount()
Public getter for hamMessageCount.
return this.hamMessageCount;
public java.util.Map getHamTokenCounts()
Public getter for the hamTokenCounts Map.
return this.hamTokenCounts;
public int getSpamMessageCount()
Public getter for spamMessageCount.
return this.spamMessageCount;
public java.util.Map getSpamTokenCounts()
Public getter for the spamTokenCounts Map.
return this.spamTokenCounts;
private java.util.SortedSet getTokenProbabilityStrengths(java.util.Set tokens, java.util.Map workCorpus)
Returns a SortedSet of TokenProbabilityStrength built from the Corpus and the tokens passed in the "tokens" Set. The ordering is from the highest strength to the lowest strength.
param
tokens
param
workCorpus
return
SortedSet of TokenProbabilityStrength objects.
//Convert to a SortedSet of token probability strengths. SortedSet tokenProbabilityStrengths = new TreeSet(); Iterator i = tokens.iterator(); while (i.hasNext()) { TokenProbabilityStrength tps = new TokenProbabilityStrength(); tps.token = (String) i.next(); if (workCorpus.containsKey(tps.token)) { tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue()); } else { //This token has never been seen before, //we'll give it initially the default probability. Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY); tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY); boolean isTokenDegeneratedFound = false; Collection degeneratedTokens = buildDegenerated(tps.token); Iterator iDegenerated = degeneratedTokens.iterator(); String tokenDegenerated = null; double strengthDegenerated; while (iDegenerated.hasNext()) { tokenDegenerated = (String) iDegenerated.next(); if (workCorpus.containsKey(tokenDegenerated)) { Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated); strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue()); if (strengthDegenerated > tps.strength) { isTokenDegeneratedFound = true; tps.strength = strengthDegenerated; corpusProbability = probabilityTemp; } } } // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default if (isTokenDegeneratedFound) { synchronized(workCorpus) { workCorpus.put(tps.token, corpusProbability); } } } tokenProbabilityStrengths.add(tps); } return tokenProbabilityStrengths;
private java.lang.String nextToken(java.io.Reader reader)
StringBuffer token = new StringBuffer(); int i; char ch, ch2; boolean previousWasDigit = false; boolean tokenCharFound = false; if (!reader.ready()) { return null; } while ((i = reader.read()) != -1) { ch = (char) i; if (ch == ':") { String tokenString = token.toString() + ':"; if (tokenString.equals("From:") || tokenString.equals("Return-Path:") || tokenString.equals("Subject:") || tokenString.equals("To:") ) { return tokenString; } } if (Character.isLetter(ch) || ch == '-" || ch == '$" || ch == '\u20AC" // the EURO symbol || ch == '!" || ch == '\'" ) { tokenCharFound = true; previousWasDigit = false; token.append(ch); } else if (Character.isDigit(ch)) { tokenCharFound = true; previousWasDigit = true; token.append(ch); } else if (previousWasDigit && (ch == '." || ch == ',")) { reader.mark(1); previousWasDigit = false; i = reader.read(); if (i == -1) { break; } ch2 = (char) i; if (Character.isDigit(ch2)) { tokenCharFound = true; previousWasDigit = true; token.append(ch); token.append(ch2); } else { reader.reset(); break; } } else if (ch == '\r") { // cr found, ignore } else if (ch == '\n") { // eol found tokenCharFound = true; previousWasDigit = false; token.append(ch); break; } else if (tokenCharFound) { break; } } if (tokenCharFound) { // System.out.println("Token read: " + token); return token.toString(); } else { return null; }
private java.util.Set parse(java.io.Reader stream)
Parses a stream into tokens, and returns a Set of the unique tokens encountered.
param
stream
return
Set
Set tokens = new HashSet(); String token; String header = ""; //Build a Map of tokens encountered. while ((token = nextToken(stream)) != null) { boolean endingLine = false; if (token.length() > 0 && token.charAt(token.length() - 1) == '\n") { endingLine = true; token = token.substring(0, token.length() - 1); } if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) { if (token.equals("From:") || token.equals("Return-Path:") || token.equals("Subject:") || token.equals("To:") ) { header = token; if (!endingLine) { continue; } } token = header + token; tokens.add(token); } if (endingLine) { header = ""; } } //Return the unique set of tokens encountered. return tokens;
public void setCorpus(java.util.Map corpus)
Public setter for corpus.
param
corpus The new corpus.
this.corpus = corpus;
public void setHamMessageCount(int hamMessageCount)
Public setter for hamMessageCount.
param
hamMessageCount The new ham message count.
this.hamMessageCount = hamMessageCount;
public void setHamTokenCounts(java.util.Map hamTokenCounts)
Public setter for the hamTokenCounts Map.
param
hamTokenCounts The new ham Token counts Map.
this.hamTokenCounts = hamTokenCounts;
public void setSpamMessageCount(int spamMessageCount)
Public setter for spamMessageCount.
param
spamMessageCount The new spam message count.
this.spamMessageCount = spamMessageCount;
public void setSpamTokenCounts(java.util.Map spamTokenCounts)
Public setter for the spamTokenCounts Map.
param
spamTokenCounts The new spam Token counts Map.
this.spamTokenCounts = spamTokenCounts;
public void tokenCountsClear()
Clears token counters.
hamTokenCounts.clear(); spamTokenCounts.clear();