File | Doc | Category | Size | Date | Package
BayesianAnalysis.java | API Doc | Apache James 2.3.1 | 18013 | Fri Jan 12 12:56:28 GMT 2007 | org.apache.james.transport.mailets

BayesianAnalysis

public class BayesianAnalysis extends org.apache.mailet.GenericMailet

Spam detection mailet using bayesian analysis techniques.

Sets an email message header indicating the probability that an email message is SPAM.

Based upon the principles described in: A Plan For Spam by Paul Graham. Extended to Paul Graham's Better Bayesian Filtering.

The analysis capabilities are based on token frequencies (the Corpus) learned through a training process (see {@link BayesianAnalysisFeeder}) and stored in a JDBC database. After a training session, the Corpus must be rebuilt from the database in order to acquire the new frequencies. Every 10 minutes a special thread in this mailet will check if any change was made to the database by the feeder, and rebuild the corpus if necessary.

An org.apache.james.spam.probability mail attribute will be created containing the computed spam probability as a {@link java.lang.Double}. The headerName message header string will be created containing such probability in floating point representation.

Sample configuration:


<mailet match="All" class="BayesianAnalysis">
<repositoryPath>db://maildb</repositoryPath>
<!--
Set this to the header name to add with the spam probability
(default is "X-MessageIsSpamProbability").
-->
<headerName>X-MessageIsSpamProbability</headerName>
<!--
Set this to true if you want to ignore messages coming from local senders
(default is false).
By local sender we mean a return-path with a local server part (server listed
in <servernames> in config.xml).
-->
<ignoreLocalSender>true</ignoreLocalSender>
<!--
Set this to the maximum message size (in bytes) that a message may have
to be considered spam (default is 100000).
-->
<maxSize>100000</maxSize>
</mailet>

The probability of being spam is pre-pended to the subject if it is > 0.1 (10%).

The required tables are automatically created if not already there (see sqlResources.xml). The token field in both the ham and spam tables is case sensitive.

see
BayesianAnalysisFeeder
see
org.apache.james.util.BayesianAnalyzer
see
org.apache.james.util.JDBCBayesianAnalyzer
version
CVS $Revision: $ $Date: $
since
2.3.0

Fields Summary
private final org.apache.james.util.JDBCUtil
theJDBCUtil
The JDBCUtil helper class
private org.apache.james.util.JDBCBayesianAnalyzer
analyzer
The JDBCBayesianAnalyzer class that does all the work.
private org.apache.avalon.excalibur.datasource.DataSourceComponent
datasource
private String
repositoryPath
private static final String
MAIL_ATTRIBUTE_NAME
private static final String
HEADER_NAME
private static final long
CORPUS_RELOAD_INTERVAL
private String
headerName
private boolean
ignoreLocalSender
private org.apache.mailet.dates.RFC822DateFormat
rfc822DateFormat
The date format object used to generate RFC 822 compliant date headers.
private int
maxSize
Holds value of property maxSize.
private long
lastCorpusLoadTime
Holds value of property lastCorpusLoadTime.
Constructors Summary
Methods Summary
private voidappendToSubject(javax.mail.internet.MimeMessage message, java.lang.String toAppend)

        try {
            String subject = message.getSubject();
            
            if (subject == null) {
                message.setSubject(toAppend, "iso-8859-1");
            } else {
                message.setSubject(toAppend + " " + subject, "iso-8859-1");
            }
        } catch (MessagingException ex) {}
    
private java.lang.StringgetAddressesString(java.util.Collection addresses)

        if (addresses == null) {
            return "null";
        }
        
        Iterator iter = addresses.iterator();
        StringBuffer sb = new StringBuffer();
        sb.append('[");
        for (int i = 0; iter.hasNext(); i++) {
            sb.append(iter.next());
            if (i + 1 < addresses.size()) {
                sb.append(", ");
            }
        }
        sb.append(']");
        return sb.toString();
    
public longgetLastCorpusLoadTime()
Getter for property lastCorpusLoadTime.

return
Value of property lastCorpusLoadTime.

        
        return this.lastCorpusLoadTime;
    
public java.lang.StringgetMailetInfo()
Return a string describing this mailet.

return
a string describing this mailet

    
                     
       
        return "BayesianAnalysis Mailet";
    
public intgetMaxSize()
Getter for property maxSize.

return
Value of property maxSize.

    
                  
       

        return this.maxSize;
    
public voidinit()
Mailet initialization routine.

throws
MessagingException if a problem arises

        repositoryPath = getInitParameter("repositoryPath");
        
        if (repositoryPath == null) {
            throw new MessagingException("repositoryPath is null");
        }
        
        headerName = getInitParameter("headerName",HEADER_NAME);
        
        ignoreLocalSender = Boolean.valueOf(getInitParameter("ignoreLocalSender")).booleanValue();
        
        if (ignoreLocalSender) {
            log("Will ignore messages coming from local senders");
        } else {
            log("Will analyze messages coming from local senders");
        }
        
        String maxSizeParam = getInitParameter("maxSize");
        if (maxSizeParam != null) {
            setMaxSize(Integer.parseInt(maxSizeParam));
        }
        log("maxSize: " + getMaxSize());
        
        initDb();
        
            CorpusLoader corpusLoader = new CorpusLoader(this);
            corpusLoader.setDaemon(true);
            corpusLoader.start();
            
    
private voidinitDb()

        
        try {
            ServiceManager serviceManager = (ServiceManager) getMailetContext().getAttribute(Constants.AVALON_COMPONENT_MANAGER);
            
            // Get the DataSourceSelector block
            DataSourceSelector datasources = (DataSourceSelector) serviceManager.lookup(DataSourceSelector.ROLE);
            
            // Get the data-source required.
            int stindex =   repositoryPath.indexOf("://") + 3;
            
            String datasourceName = repositoryPath.substring(stindex);
            
            datasource = (DataSourceComponent) datasources.select(datasourceName);
        } catch (Exception e) {
            throw new MessagingException("Can't get datasource", e);
        }
        
        try {
            analyzer.initSqlQueries(datasource.getConnection(), getMailetContext());
        } catch (Exception e) {
            throw new MessagingException("Exception initializing queries", e);
        }        
        
        try {
            loadData(datasource.getConnection());
        } catch (java.sql.SQLException se) {
            throw new MessagingException("SQLException loading data", se);
        }        
    
private voidloadData(java.sql.Connection conn)

        
        try {
            // this is synchronized to avoid concurrent update of the corpus
            synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) {
                analyzer.tokenCountsClear();
                analyzer.loadHamNSpam(conn);
                analyzer.buildCorpus();
                analyzer.tokenCountsClear();
            }
            
            log("BayesianAnalysis Corpus loaded");
            
            touchLastCorpusLoadTime();
            
        } finally {
            if (conn != null) {
                theJDBCUtil.closeJDBCConnection(conn);
            }
        }
        
    
private voidsaveChanges(javax.mail.internet.MimeMessage message)
Saves changes resetting the original message id.

        String messageId = message.getMessageID();
        message.saveChanges();
        if (messageId != null) {
            message.setHeader(RFC2822Headers.MESSAGE_ID, messageId);
        }
    
private voidsendReplyFromPostmaster(org.apache.mailet.Mail mail, java.lang.String stringContent)

        try {
            MailAddress notifier = getMailetContext().getPostmaster();
            
            MailAddress senderMailAddress = mail.getSender();
            
            MimeMessage message = mail.getMessage();
            //Create the reply message
            MimeMessage reply = new MimeMessage(Session.getDefaultInstance(System.getProperties(), null));
            
            //Create the list of recipients in the Address[] format
            InternetAddress[] rcptAddr = new InternetAddress[1];
            rcptAddr[0] = senderMailAddress.toInternetAddress();
            reply.setRecipients(Message.RecipientType.TO, rcptAddr);
            
            //Set the sender...
            reply.setFrom(notifier.toInternetAddress());
            
            //Create the message body
            MimeMultipart multipart = new MimeMultipart();
            //Add message as the first mime body part
            MimeBodyPart part = new MimeBodyPart();
            part.setContent(stringContent, "text/plain");
            part.setHeader(RFC2822Headers.CONTENT_TYPE, "text/plain");
            multipart.addBodyPart(part);
            
            reply.setContent(multipart);
            reply.setHeader(RFC2822Headers.CONTENT_TYPE, multipart.getContentType());
            
            //Create the list of recipients in our MailAddress format
            Set recipients = new HashSet();
            recipients.add(senderMailAddress);
            
            //Set additional headers
            if (reply.getHeader(RFC2822Headers.DATE)==null){
                reply.setHeader(RFC2822Headers.DATE, rfc822DateFormat.format(new java.util.Date()));
            }
            String subject = message.getSubject();
            if (subject == null) {
                subject = "";
            }
            if (subject.indexOf("Re:") == 0){
                reply.setSubject(subject);
            } else {
                reply.setSubject("Re:" + subject);
            }
            reply.setHeader(RFC2822Headers.IN_REPLY_TO, message.getMessageID());
            
            //Send it off...
            getMailetContext().sendMail(notifier, recipients, reply);
        } catch (Exception e) {
            log("Exception found sending reply", e);
        }
    
public voidservice(org.apache.mailet.Mail mail)
Scans the mail and determines the spam probability.

param
mail The Mail message to be scanned.
throws
MessagingException if a problem arises

        
        try {
            MimeMessage message = mail.getMessage();
            
            if (ignoreLocalSender) {
                // ignore the message if the sender is local
                if (mail.getSender() != null
                        && getMailetContext().isLocalServer(mail.getSender().getHost())) {
                    return;
                }
            }
            
            String [] headerArray = message.getHeader(headerName);
            // ignore the message if already analyzed
            if (headerArray != null && headerArray.length > 0) {
                return;
            }
            
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            
            double probability;
            
            if (message.getSize() < getMaxSize()) {
                message.writeTo(baos);
                probability = analyzer.computeSpamProbability(new BufferedReader(new StringReader(baos.toString())));
            } else {
                probability = 0.0;
            }
            
            mail.setAttribute(MAIL_ATTRIBUTE_NAME, new Double(probability));
            message.setHeader(headerName, Double.toString(probability));
            
            DecimalFormat probabilityForm = (DecimalFormat) DecimalFormat.getInstance();
            probabilityForm.applyPattern("##0.##%");
            String probabilityString = probabilityForm.format(probability);
            
            String senderString;
            if (mail.getSender() == null) {
                senderString = "null";
            } else {
                senderString = mail.getSender().toString();
            }
            if (probability > 0.1) {
                log(headerName
                        + ": "
                        + probabilityString
                        + "; From: "
                        + senderString
                        + "; Recipient(s): "
                        + getAddressesString(mail.getRecipients()));
                
                appendToSubject(message,
                        " [" + probabilityString
                        + (probability > 0.9 ? " SPAM" : " spam") + "]");
            }
            
            saveChanges(message);
            
        } catch (Exception e) {
            log("Exception: "
                    + e.getMessage(), e);
            throw new MessagingException("Exception thrown", e);
        }
    
public voidsetMaxSize(int maxSize)
Setter for property maxSize.

param
maxSize New value of property maxSize.


        this.maxSize = maxSize;
    
private voidtouchLastCorpusLoadTime()
Sets lastCorpusLoadTime to System.currentTimeMillis().

        
        this.lastCorpusLoadTime = System.currentTimeMillis();