FileDocCategorySizeDatePackage
BayesianAnalyzer.javaAPI DocApache James 2.3.124103Fri Jan 12 12:56:32 GMT 2007org.apache.james.util

BayesianAnalyzer

public class BayesianAnalyzer extends Object

Determines probability that text contains Spam.

Based upon Paul Grahams' A Plan for Spam. Extended to Paul Grahams' Better Bayesian Filtering.

Sample method usage:

Use: void addHam(Reader) and void addSpam(Reader) methods to build up the Maps of ham & spam tokens/occurrences. Both addHam and addSpam assume they're reading one message at a time, if you feed more than one message per call, be sure to adjust the appropriate message counter: hamMessageCount or spamMessageCount. Then...

Use: void buildCorpus() to build the final token/probabilities Map. Use your own methods for persistent storage of either the individual ham/spam corpus & message counts, and/or the final corpus. Then you can...

Use: double computeSpamProbability(Reader) to determine the probability that a particular text contains spam. A returned result of 0.9 or above is an indicator that the text was spam.

If you use persistent storage, use: void setCorpus(Map) before calling computeSpamProbability.

version
CVS $Revision: $ $Date: $
since
2.3.0

Fields Summary
private static final int
MAX_INTERESTING_TOKENS
Number of "interesting" tokens to use to compute overall spamminess probability.
private static final double
INTERESTINGNESS_THRESHOLD
Minimum probability distance from 0.5 to consider a token "interesting" to use to compute overall spamminess probability.
private static final double
DEFAULT_TOKEN_PROBABILITY
Default token probability to use when a token has not been encountered before.
private Map
hamTokenCounts
Map of ham tokens and their occurrences. String key Integer value
private Map
spamTokenCounts
Map of spam tokens and their occurrences. String key Integer value
private int
hamMessageCount
Number of ham messages analyzed.
private int
spamMessageCount
Number of spam messages analyzed.
private Map
corpus
Final token/probability corpus. String key Double value
Constructors Summary
public BayesianAnalyzer()
Basic class constructor.

    
Methods Summary
public voidaddHam(java.io.Reader stream)
Adds a message to the ham list.

param
stream A reader stream on the ham message to analyze
throws
IOException If any error occurs

        addTokenOccurrences(stream, hamTokenCounts);
        hamMessageCount++;
    
public voidaddSpam(java.io.Reader stream)
Adds a message to the spam list.

param
stream A reader stream on the spam message to analyze
throws
IOException If any error occurs

        addTokenOccurrences(stream, spamTokenCounts);
        spamMessageCount++;
    
private voidaddTokenOccurrences(java.io.Reader stream, java.util.Map target)
Parses a stream into tokens, and updates the target Map with the token/counts.

param
stream
param
target

        String token;
        String header = "";
        
        //Update target with the tokens/count encountered.
        while ((token = nextToken(stream)) != null) {
            boolean endingLine = false;
            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n") {
                endingLine = true;
                token = token.substring(0, token.length() - 1);
            }
            
            if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
                if (token.equals("From:")
                || token.equals("Return-Path:")
                || token.equals("Subject:")
                || token.equals("To:")
                ) {
                    header = token;
                    if (!endingLine) {
                        continue;
                    }
                }
                
                token = header + token;
                
                Integer value = null;
                
                if (target.containsKey(token)) {
                    value = new Integer(((Integer) target.get(token)).intValue() + 1);
                } else {
                    value = new Integer(1);
                }
                
                target.put(token, value);
            }
            
            if (endingLine) {
                header = "";
            }
        }
    
private booleanallDigits(java.lang.String s)

        for (int i = 0; i < s.length(); i++) {
            if (!Character.isDigit(s.charAt(i))) {
                return false;
            }
        }
        return true;
    
private booleanallSameChar(java.lang.String s)

        if (s.length() < 2) {
            return false;
        }
        
        char c = s.charAt(0);
        
        for (int i = 1; i < s.length(); i++) {
            if (s.charAt(i) != c) {
                return false;
            }
        }
        return true;
    
public voidbuildCorpus()
Builds the corpus from the existing ham & spam counts.

        //Combine the known ham & spam tokens.
        Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
        set.addAll(hamTokenCounts.keySet());
        set.addAll(spamTokenCounts.keySet());
        Map tempCorpus = new HashMap(set.size());
        
        //Iterate through all the tokens and compute their new
        //individual probabilities.
        Iterator i = set.iterator();
        while (i.hasNext()) {
            String token = (String) i.next();
            tempCorpus.put(token, new Double(computeProbability(token)));
        }
        setCorpus(tempCorpus);
    
private java.util.CollectionbuildDegenerated(java.lang.String fullToken)

        ArrayList tokens = new ArrayList();
        String header;
        String token;
        
        // look for a header string termination
        int headerEnd = fullToken.indexOf(':");
        if (headerEnd >= 0) {
            header = fullToken.substring(0, headerEnd);
            token = fullToken.substring(headerEnd);
        } else {
            header = "";
            token = fullToken;
        }
        
        int end = token.length();
        do {
            if (!token.substring(0, end).equals(token.substring(0, end).toLowerCase())) {
                tokens.add(header + token.substring(0, end).toLowerCase());
                if (header.length() > 0) {
                    tokens.add(token.substring(0, end).toLowerCase());
                }
            }
            if (end > 1 && token.charAt(0) >= 'A" && token.charAt(0) <= 'Z") {
                tokens.add(header + token.charAt(0) + token.substring(1, end).toLowerCase());
                if (header.length() > 0) {
                    tokens.add(token.charAt(0) + token.substring(1, end).toLowerCase());
                }
            }
            
            if (token.charAt(end - 1) != '!") {
                break;
            }
            
            end--;
            
            tokens.add(header + token.substring(0, end));
            if (header.length() > 0) {
                tokens.add(token.substring(0, end));
            }
        } while (end > 0);
        
        return tokens;
    
public voidclear()
Clears all analysis repositories and counters.

        corpus.clear();
        
        tokenCountsClear();
        
        hamMessageCount = 0;
        spamMessageCount = 0;
    
private doublecomputeOverallProbability(java.util.SortedSet tokenProbabilityStrengths, java.util.Map workCorpus)
Compute the spamminess probability of the interesting tokens in the tokenProbabilities SortedSet.

param
tokenProbabilities
param
workCorpus
return
Computed spamminess.

        double p = 1.0;
        double np = 1.0;
        double tempStrength = 0.5;
        int count = MAX_INTERESTING_TOKENS;
        Iterator iterator = tokenProbabilityStrengths.iterator();
        while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
            TokenProbabilityStrength tps = (TokenProbabilityStrength) iterator.next();
            tempStrength = tps.strength;
            
            //      System.out.println(tps);
            
            double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
            Double theDoubleObject = (Double) workCorpus.get(tps.token);
            // if either the original token or a degeneration was found use the double value, otherwise use the default
            if (theDoubleObject != null) {
                theDoubleValue = theDoubleObject.doubleValue();
            }
            p *= theDoubleValue;
            np *= (1.0 - theDoubleValue);
            // System.out.println("Token:" + tps.token + ", p=" + theDoubleValue + ", overall p=" + p / (p + np));
        }
        
        return (p / (p + np));
    
private doublecomputeProbability(java.lang.String token)
Compute the probability that "token" is SPAM.

param
token
return
The probability that the token occurs within spam.

        double hamFactor  = 0;
        double spamFactor = 0;
        
        boolean foundInHam = false;
        boolean foundInSpam = false;
        
        double minThreshold = 0.01;
        double maxThreshold = 0.99;
        
        if (hamTokenCounts.containsKey(token)) {
            foundInHam = true;
        }
        
        if (spamTokenCounts.containsKey(token)) {
            foundInSpam = true;
        }
        
        if (foundInHam) {
            hamFactor = 2 *((Integer) hamTokenCounts.get(token)).doubleValue();
            if (!foundInSpam) {
                minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
            }
        }
        
        if (foundInSpam) {
            spamFactor = ((Integer) spamTokenCounts.get(token)).doubleValue();
            if (!foundInHam) {
                maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
            }
        }
        
        if ((hamFactor + spamFactor) < 5) {
            //This token hasn't been seen enough.
            return 0.4;
        }
        
        double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
        double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
        
        return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
    
public doublecomputeSpamProbability(java.io.Reader stream)
Computes the probability that the stream contains SPAM.

param
stream The text to be analyzed for Spamminess.
return
A 0.0 - 1.0 probability
throws
IOException If any error occurs

        //Build a set of the tokens in the Stream.
        Set tokens = parse(stream);
        
        // Get the corpus to use in this run
        // A new corpus may be being built in the meantime
        Map workCorpus = getCorpus();
        
        //Assign their probabilities from the Corpus (using an additional
        //calculation to determine spamminess).
        SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
        
        //Compute and return the overall probability that the
        //stream is SPAM.
        return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
    
public java.util.MapgetCorpus()
Public getter for corpus.

        return this.corpus;
    
public intgetHamMessageCount()
Public getter for hamMessageCount.

        return this.hamMessageCount;
    
public java.util.MapgetHamTokenCounts()
Public getter for the hamTokenCounts Map.

        return this.hamTokenCounts;
    
public intgetSpamMessageCount()
Public getter for spamMessageCount.

        return this.spamMessageCount;
    
public java.util.MapgetSpamTokenCounts()
Public getter for the spamTokenCounts Map.

        return this.spamTokenCounts;
    
private java.util.SortedSetgetTokenProbabilityStrengths(java.util.Set tokens, java.util.Map workCorpus)
Returns a SortedSet of TokenProbabilityStrength built from the Corpus and the tokens passed in the "tokens" Set. The ordering is from the highest strength to the lowest strength.

param
tokens
param
workCorpus
return
SortedSet of TokenProbabilityStrength objects.

        //Convert to a SortedSet of token probability strengths.
        SortedSet tokenProbabilityStrengths = new TreeSet();
        
        Iterator i = tokens.iterator();
        while (i.hasNext()) {
            TokenProbabilityStrength tps = new TokenProbabilityStrength();
            
            tps.token = (String) i.next();
            
            if (workCorpus.containsKey(tps.token)) {
                tps.strength = Math.abs(0.5 - ((Double) workCorpus.get(tps.token)).doubleValue());
            }
            else {
                //This token has never been seen before,
                //we'll give it initially the default probability.
                Double corpusProbability = new Double(DEFAULT_TOKEN_PROBABILITY);
                tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
                boolean isTokenDegeneratedFound = false;
                
                Collection degeneratedTokens = buildDegenerated(tps.token);
                Iterator iDegenerated = degeneratedTokens.iterator();
                String tokenDegenerated = null;
                double strengthDegenerated;
                while (iDegenerated.hasNext()) {
                    tokenDegenerated = (String) iDegenerated.next();
                    if (workCorpus.containsKey(tokenDegenerated)) {
                        Double probabilityTemp = (Double) workCorpus.get(tokenDegenerated);
                        strengthDegenerated = Math.abs(0.5 - probabilityTemp.doubleValue());
                        if (strengthDegenerated > tps.strength) {
                            isTokenDegeneratedFound = true;
                            tps.strength = strengthDegenerated;
                            corpusProbability = probabilityTemp;
                        }
                    }
                }
                // to reduce memory usage, put in the corpus only if the probability is different from (stronger than) the default
                if (isTokenDegeneratedFound) {
                    synchronized(workCorpus) {
                        workCorpus.put(tps.token, corpusProbability);
                    }
                }
            }
            
            tokenProbabilityStrengths.add(tps);
        }
        
        return tokenProbabilityStrengths;
    
private java.lang.StringnextToken(java.io.Reader reader)

        StringBuffer token = new StringBuffer();
        int i;
        char ch, ch2;
        boolean previousWasDigit = false;
        boolean tokenCharFound = false;
        
        if (!reader.ready()) {
            return null;
        }
        
        while ((i = reader.read()) != -1) {
            
            ch = (char) i;
            
            if (ch == ':") {
                String tokenString = token.toString() + ':";
                if (tokenString.equals("From:")
                || tokenString.equals("Return-Path:")
                || tokenString.equals("Subject:")
                || tokenString.equals("To:")
                ) {
                    return tokenString;
                }
            }
            
            if (Character.isLetter(ch)
            || ch == '-"
            || ch == '$"
            || ch == '\u20AC" // the EURO symbol
            || ch == '!"
            || ch == '\'"
            ) {
                tokenCharFound = true;
                previousWasDigit = false;
                token.append(ch);
            } else if (Character.isDigit(ch)) {
                tokenCharFound = true;
                previousWasDigit = true;
                token.append(ch);
            } else if (previousWasDigit && (ch == '." || ch == ',")) {
                reader.mark(1);
                previousWasDigit = false;
                i = reader.read();
                if (i == -1) {
                    break;
                }
                ch2 = (char) i;
                if (Character.isDigit(ch2)) {
                    tokenCharFound = true;
                    previousWasDigit = true;
                    token.append(ch);
                    token.append(ch2);
                } else {
                    reader.reset();
                    break;
                }
            } else if (ch == '\r") {
                // cr found, ignore
            } else if (ch == '\n") {
                // eol found
                tokenCharFound = true;
                previousWasDigit = false;
                token.append(ch);
                break;
            } else if (tokenCharFound) {
                break;
            }
        }
        
        if (tokenCharFound) {
            //          System.out.println("Token read: " + token);
            return token.toString();
        } else {
            return null;
        }
    
private java.util.Setparse(java.io.Reader stream)
Parses a stream into tokens, and returns a Set of the unique tokens encountered.

param
stream
return
Set

        Set tokens = new HashSet();
        String token;
        String header = "";
        
        //Build a Map of tokens encountered.
        while ((token = nextToken(stream)) != null) {
            boolean endingLine = false;
            if (token.length() > 0 && token.charAt(token.length() - 1) == '\n") {
                endingLine = true;
                token = token.substring(0, token.length() - 1);
            }
            
            if (token.length() > 0 && header.length() + token.length() < 90 && !allDigits(token)) {
                if (token.equals("From:")
                || token.equals("Return-Path:")
                || token.equals("Subject:")
                || token.equals("To:")
                ) {
                    header = token;
                    if (!endingLine) {
                        continue;
                    }
                }
                
                token = header + token;
                
                tokens.add(token);
            }
            
            if (endingLine) {
                header = "";
            }
        }
        
        //Return the unique set of tokens encountered.
        return tokens;
    
public voidsetCorpus(java.util.Map corpus)
Public setter for corpus.

param
corpus The new corpus.

        this.corpus = corpus;
    
public voidsetHamMessageCount(int hamMessageCount)
Public setter for hamMessageCount.

param
hamMessageCount The new ham message count.

        this.hamMessageCount = hamMessageCount;
    
public voidsetHamTokenCounts(java.util.Map hamTokenCounts)
Public setter for the hamTokenCounts Map.

param
hamTokenCounts The new ham Token counts Map.

        this.hamTokenCounts = hamTokenCounts;
    
public voidsetSpamMessageCount(int spamMessageCount)
Public setter for spamMessageCount.

param
spamMessageCount The new spam message count.

        this.spamMessageCount = spamMessageCount;
    
public voidsetSpamTokenCounts(java.util.Map spamTokenCounts)
Public setter for the spamTokenCounts Map.

param
spamTokenCounts The new spam Token counts Map.

        this.spamTokenCounts = spamTokenCounts;
    
public voidtokenCountsClear()
Clears token counters.

        hamTokenCounts.clear();
        spamTokenCounts.clear();