From 4d27ef28f74e105ff244b7ec0e636a765c9de02f Mon Sep 17 00:00:00 2001
From: Jeremy Fincher
Date: Sun, 3 Oct 2004 09:08:36 +0000
Subject: [PATCH] Initial checkin.

---
 others/reverend/__init__.py |   0
 others/reverend/splitter.py |  96 +++++++++++
 others/reverend/thomas.py   | 309 ++++++++++++++++++++++++++++++++++++
 plugins/Bayes.py            | 204 ++++++++++++++++++++++++
 4 files changed, 609 insertions(+)
 create mode 100755 others/reverend/__init__.py
 create mode 100755 others/reverend/splitter.py
 create mode 100755 others/reverend/thomas.py
 create mode 100644 plugins/Bayes.py

diff --git a/others/reverend/__init__.py b/others/reverend/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/others/reverend/splitter.py b/others/reverend/splitter.py
new file mode 100755
index 000000000..04bfea96e
--- /dev/null
+++ b/others/reverend/splitter.py
@@ -0,0 +1,96 @@
+# This module is part of the Pyndex project and is Copyright 2003 Amir
+# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
+# it and/or modify it under the terms of version 2.1 of the GNU Lesser
+# General Public License as published by the Free Software Foundation.
+
+import string
+
+class Splitter(object):
+    """Split plain text into words: utility class.
+    Adapted from David Mertz's article in IBM developerWorks.
+    Needs work to handle international characters, etc."""
+
+##    __slots__ = ['stemmer', 'porter', 'stopwording', 'word_only', 'nonword',
+##                 'nondigits', 'alpha', 'ident', 'tokens', 'position']
+
+    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
+                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
+                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
+                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
+                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
+                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
+                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}
+
+    yes = string.lowercase + string.digits + ''  # throw in any extras
+    nonword = ''
+    for i in range(0, 255):
+        if chr(i) not in yes:
+            nonword += chr(i)
+
+    word_only = string.maketrans(nonword, " " * len(nonword))
+
+    nondigits = string.join(map(chr, range(0, 48)) + map(chr, range(58, 255)), '')
+    alpha = string.join(map(chr, range(65, 91)) + map(chr, range(97, 123)), '')
+    ident = string.join(map(chr, range(256)), '')
+
+    def close(self):
+        # Lupy support
+        pass
+
+    def tokenStream(self, fieldName, file, casesensitive=False):
+        """Split a text/plain string into a list of words.
+        """
+        self.tokens = self.split(file.read())
+        self.position = 0
+        return self
+
+    def next(self):
+        if self.position >= len(self.tokens):
+            return None
+        res = Token(self.tokens[self.position])
+        self.position += 1
+        return res
+
+    def split(self, text, casesensitive=0):
+        # Speedup trick: attributes into local scope
+        word_only = self.word_only
+        ident = self.ident
+        alpha = self.alpha
+        nondigits = self.nondigits
+
+        # Let's adjust case if not case-sensitive
+        if not casesensitive: text = string.lower(text)
+
+        # Split the raw text
+        allwords = text.translate(word_only).split()  # Let's strip funny byte values
+
+        # Finally, let's skip some words not worth indexing
+        words = []
+        for word in allwords:
+            if len(word) > 32: continue  # too long (probably gibberish)
+
+            # Identify common patterns in non-word data (binary, UU/MIME, etc)
+            num_nonalpha = len(word.translate(ident, alpha))
+            numdigits = len(word.translate(ident, nondigits))
+            if numdigits > len(word)-2:  # almost all digits
+                if numdigits > 5:  # too many digits is gibberish
+                    continue  # a moderate number is year/zipcode/etc
+            elif num_nonalpha*2 > len(word):  # too much scattered nonalpha = gibberish
+                continue
+
+            word = word.translate(word_only)  # Let's strip funny byte values
+            subwords = word.split()  # maybe embedded non-alphanumeric
+            for subword in subwords:  # ...so we might have subwords
+                if len(subword) <= 1: continue  # too short a subword
+                words.append(subword)
+
+        return words
+
+class Token:
+    def __init__(self, trmText):
+        self.trmText = trmText
+
+    def termText(self):
+        return self.trmText
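What split() keeps and drops is easiest to see on a concrete input. A rough
sketch (hypothetical strings; Python 2, since the module relies on
string.lowercase and string.maketrans; assumes others/ is on sys.path so the
package imports as reverend):

    from reverend.splitter import Splitter

    s = Splitter()
    print s.split('Hello, World!  My ZIP is 02139; my card is 4111111111111111.')
    # -> ['hello', 'world', 'my', 'zip', 'is', '02139', 'my', 'card', 'is']
    # '02139' survives the digit heuristic (five digits or fewer is assumed
    # to be a year/zipcode), while the sixteen-digit card number is dropped
    # as gibberish.  Note that stopWords is defined but never consulted by
    # split(), so 'my' and 'is' pass through.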
diff --git a/others/reverend/thomas.py b/others/reverend/thomas.py
new file mode 100755
index 000000000..7c307bbce
--- /dev/null
+++ b/others/reverend/thomas.py
@@ -0,0 +1,309 @@
+# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
+# amir@divmod.org. This is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+
+import operator
+import string
+import math
+from sets import Set
+from splitter import Splitter
+
+class BayesData(dict):
+
+    def __init__(self, name='', pool=None):
+        self.name = name
+        self.training = []
+        self.pool = pool
+        self.tokenCount = 0
+        self.trainCount = 0
+
+    def trainedOn(self, item):
+        return item in self.training
+
+    def __repr__(self):
+        return '<BayesDict: %s, %s tokens>' % (self.name, self.tokenCount)
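Each pool, including the special '__Corpus__' pool, is a BayesData: an
ordinary dict mapping token to count, with naming and counting bookkeeping
bolted on. A small sketch of the shape (hypothetical contents; Python 2):

    pool = BayesData('python')
    pool['def'] = 3
    pool['return'] = 2
    pool.tokenCount = 5
    print pool                      # -> <BayesDict: python, 5 tokens>
    print pool.trainedOn('msg-42')  # -> False; train() appends uids to
                                    #    pool.training when given one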
+ """ + return [tok for tok, count in self.poolData(poolName)] + + def save(self, fname='bayesdata.dat'): + from cPickle import dump + fp = open(fname, 'wb') + dump(self.pools, fp) + fp.close() + + def load(self, fname='bayesdata.dat'): + from cPickle import load + fp = open(fname, 'rb') + self.pools = load(fp) + fp.close() + self.corpus = self.pools['__Corpus__'] + self.dirty = True + + def poolNames(self): + """Return a sorted list of Pool names. + Does not include the system pool '__Corpus__'. + """ + pools = self.pools.keys() + pools.remove('__Corpus__') + pools = [pool for pool in pools] + pools.sort() + return pools + + def buildCache(self): + """ merges corpora and computes probabilities + """ + self.cache = {} + for pname, pool in self.pools.items(): + # skip our special pool + if pname == '__Corpus__': + continue + + poolCount = len(pool) + themCount = max(len(self.corpus) - poolCount, 1) + cacheDict = self.cache.setdefault(pname, self.dataClass(pname)) + + for word, totCount in self.corpus.items(): + # for every word in the copus + # check to see if this pool contains this word + thisCount = float(pool.get(word, 0.0)) + otherCount = float(totCount) - thisCount + + if not poolCount: + goodMetric = 1.0 + else: + goodMetric = min(1.0, otherCount/poolCount) + badMetric = min(1.0, thisCount/themCount) + f = badMetric / (goodMetric + badMetric) + + # PROBABILITY_THRESHOLD + if abs(f-0.5) >= 0.1 : + # GOOD_PROB, BAD_PROB + cacheDict[word] = max(0.0001, min(0.9999, f)) + + def poolProbs(self): + if self.dirty: + self.buildCache() + self.dirty = False + return self.cache + + def getTokens(self, obj): + """Hopefully it's a string and we'll just split it + on non-alphanumeric stuff. + + Override this in your subclass for objects other + than text. + + Alternatively, you can pass in a tokenizer as part of + instance creation. + """ + return self.split(obj) + + def getProbs(self, pool, words): + """ extracts the probabilities of tokens in a message + """ + probs = [(word, pool[word]) for word in words if word in pool] + probs.sort(lambda x,y: cmp(y[1],x[1])) + return probs[:2048] + + def train(self, pool, item, uid=None): + """Train Bayes by telling him that item belongs + in pool. uid is optional and may be used to uniquely + identify the item that is being trained on. + """ + tokens = self.tokenizer(item) + pool = self.pools.setdefault(pool, self.dataClass(pool)) + self._train(pool, tokens) + self.corpus.trainCount += 1 + pool.trainCount += 1 + if uid: + pool.training.append(uid) + self.dirty = True + + def untrain(self, pool, item, uid=None): + tokens = self.tokenizer(item) + pool = self.pools.get(pool, None) + if not pool: + return + self._untrain(pool, tokens) + # I guess we want to count this as additional training? 
+        self.corpus.trainCount += 1
+        pool.trainCount += 1
+        if uid:
+            pool.training.remove(uid)
+        self.dirty = True
+
+    def _train(self, pool, tokens):
+        wc = 0
+        for token in tokens:
+            count = pool.get(token, 0)
+            pool[token] = count + 1
+            count = self.corpus.get(token, 0)
+            self.corpus[token] = count + 1
+            wc += 1
+        pool.tokenCount += wc
+        self.corpus.tokenCount += wc
+
+    def _untrain(self, pool, tokens):
+        for token in tokens:
+            count = pool.get(token, 0)
+            if count:
+                if count == 1:
+                    del(pool[token])
+                else:
+                    pool[token] = count - 1
+                pool.tokenCount -= 1
+
+            count = self.corpus.get(token, 0)
+            if count:
+                if count == 1:
+                    del(self.corpus[token])
+                else:
+                    self.corpus[token] = count - 1
+                self.corpus.tokenCount -= 1
+
+    def trainedOn(self, msg):
+        for p in self.cache.values():
+            if msg in p.training:
+                return True
+        return False
+
+    def guess(self, msg):
+        tokens = Set(self.tokenizer(msg))
+        pools = self.poolProbs()
+
+        res = {}
+        for pname, pprobs in pools.items():
+            p = self.getProbs(pprobs, tokens)
+            if len(p) != 0:
+                res[pname] = self.combiner(p, pname)
+        res = res.items()
+        res.sort(lambda x, y: cmp(y[1], x[1]))
+        return res
+
+    def robinson(self, probs, ignore):
+        """ computes the probability of a message being spam (Robinson's method)
+            P = 1 - prod(1-p)^(1/n)
+            Q = 1 - prod(p)^(1/n)
+            S = (1 + (P-Q)/(P+Q)) / 2
+            Courtesy of http://christophe.delord.free.fr/en/index.html
+        """
+        nth = 1./len(probs)
+        P = 1.0 - reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0) ** nth
+        Q = 1.0 - reduce(operator.mul, map(lambda p: p[1], probs)) ** nth
+        S = (P - Q) / (P + Q)
+        return (1 + S) / 2
+
+    def robinsonFisher(self, probs, ignore):
+        """ computes the probability of a message being spam (Robinson-Fisher method)
+            H = C-1( -2.ln(prod(p)), 2*n )
+            S = C-1( -2.ln(prod(1-p)), 2*n )
+            I = (1 + H - S) / 2
+            Courtesy of http://christophe.delord.free.fr/en/index.html
+        """
+        n = len(probs)
+        try: H = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: p[1], probs), 1.0)), 2*n)
+        except OverflowError: H = 0.0
+        try: S = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0)), 2*n)
+        except OverflowError: S = 0.0
+        return (1 + H - S) / 2
+
+    def __repr__(self):
+        return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]
+
+    def __len__(self):
+        return len(self.corpus)
+
+
+def chi2P(chi, df):
+    """ return P(chisq >= chi, with df degrees of freedom)
+
+    df must be even
+    """
+    assert df & 1 == 0
+    m = chi / 2.0
+    sum = term = math.exp(-m)
+    for i in range(1, df/2):
+        term *= m/i
+        sum += term
+    return min(sum, 1.0)
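robinson() is the default combiner. For the n cached tokens that matched, it
computes P = 1 - prod(1-p)^(1/n) and Q = 1 - prod(p)^(1/n), and returns
(1 + (P-Q)/(P+Q)) / 2, so 0.5 means no signal either way. A worked sketch
with made-up token probabilities (Python 2):

    import operator

    probs = [('viagra', 0.99), ('meeting', 0.20), ('free', 0.90)]
    nth = 1.0 / len(probs)
    P = 1.0 - reduce(operator.mul, [1.0 - p for (tok, p) in probs], 1.0) ** nth
    Q = 1.0 - reduce(operator.mul, [p for (tok, p) in probs], 1.0) ** nth
    S = (P - Q) / (P + Q)
    print (1 + S) / 2
    # P ~ 0.907, Q ~ 0.437, result ~ 0.675: leaning toward this pool,
    # but not decisively.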
diff --git a/plugins/Bayes.py b/plugins/Bayes.py
new file mode 100644
index 000000000..14aef10e6
--- /dev/null
+++ b/plugins/Bayes.py
@@ -0,0 +1,204 @@
+###
+# Copyright (c) 2004, Jeremiah Fincher
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#   * Redistributions of source code must retain the above copyright notice,
+#     this list of conditions, and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions, and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   * Neither the name of the author of this software nor the name of
+#     contributors to this software may be used to endorse or promote products
+#     derived from this software without specific prior written consent.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###
+
+"""
+Watches for paste-floods in a channel and takes appropriate measures against
+violators.
+"""
+
+import supybot
+
+__revision__ = "$Id$"
+__author__ = supybot.authors.jemfinch
+__contributors__ = {}
+
+import supybot.plugins as plugins
+
+import glob
+import os.path
+import reverend.thomas
+from cStringIO import StringIO as sio
+
+import supybot.conf as conf
+import supybot.utils as utils
+from supybot.commands import *
+import supybot.ircutils as ircutils
+import supybot.registry as registry
+import supybot.callbacks as callbacks
+
+
+def configure(advanced):
+    # This will be called by setup.py to configure this module.  Advanced is
+    # a bool that specifies whether the user identified himself as an advanced
+    # user or not.  You should effect your configuration by manipulating the
+    # registry as appropriate.
+    from supybot.questions import expect, anything, something, yn
+    conf.registerPlugin('Bayes', True)
+
+Bayes = conf.registerPlugin('Bayes')
+conf.registerChannelValue(Bayes, 'maximumLines',
+    registry.NonNegativeInteger(4, """Determines the maximum allowable number
+    of consecutive messages that classify as a paste.
+    If this value is 0, no checking will be done."""))
+
+def tokenize(s):
+    return s.lower().split()
+
+class PickleBayesDB(plugins.DbiChannelDB):
+    class DB(object):
+        def __init__(self, filename):
+            self.filename = filename
+            self.nickFilename = self.filename.replace('pickle', 'nick.pickle')
+            self.bayes = reverend.thomas.Bayes(tokenize)
+            if os.path.exists(filename):
+                self.bayes.load(filename)
+            self.nickBayes = reverend.thomas.Bayes(tokenize)
+            if os.path.exists(self.nickFilename):
+                self.nickBayes.load(self.nickFilename)
+
+        def close(self):
+            self.bayes.save(self.filename)
+            self.nickBayes.save(self.nickFilename)
+        flush = close
+
+        def train(self, kind, s):
+            self.bayes.train(kind, s)
+
+        def trainNick(self, nick, s):
+            self.nickBayes.train(nick, s)
+
+        def guess(self, s):
+            matches = self.bayes.guess(s)
+            if matches:
+                if matches[0][1] > 0.5:
+                    if len(matches) > 1 and \
+                       matches[0][1] - matches[1][1] < 0.4:
+                        return None
+                    else:
+                        return matches[0]
+            else:
+                self.bayes.train('normal', s)
+            return None
+
+        def guessNick(self, s):
+            L = [t for t in self.nickBayes.guess(s) if t[1] > 0.01]
+            if len(L) > 1:
+                if L[0][1] / L[1][1] > 2:
+                    return [L[0]]
+            return L
+
+BayesDB = plugins.DB('Bayes', {'pickle': PickleBayesDB})
+
+class Bayes(callbacks.Privmsg):
+    def __init__(self):
+        self.__parent = super(Bayes, self)
+        self.__parent.__init__()
+        self.db = BayesDB()
+
+    def die(self):
+        self.db.close()
+
+    def doPrivmsg(self, irc, msg):
+        (channel, text) = msg.args
+        if not ircutils.isChannel(channel) or msg.guessed:
+            return
+        kind = self.db.guess(channel, text)
+        if kind is not None:
+            (kind, prob) = kind
+            prob *= 100
+            text = utils.ellipsisify(text, 30)
+            self.log.warning('Classified %r as %s. (%.2f%%)', text, kind, prob)
+        self.db.trainNick(channel, msg.nick, text)
+
+    def guess(self, irc, msg, args, channel, text):
+        """[<channel>] <text>
+
+        Guesses how <text> should be classified according to the Bayesian
+        classifier for <channel>.  <channel> is only necessary if the message
+        isn't sent in the channel itself, and then only if
+        supybot.databases.plugins.channelSpecific is True.
+        """
+        msg.tag('guessed')
+        kind = self.db.guess(channel, text)
+        if kind is not None:
+            (kind, prob) = kind
+            prob *= 100
+            irc.reply('That seems to me to be %s, '
+                      'but I\'m only %.2f%% certain.' % (kind, prob))
+        else:
+            irc.reply('I don\'t know what the heck that is.')
+    guess = wrap(guess, ['channeldb', 'something'])
+
+    def who(self, irc, msg, args, channel, text):
+        """[<channel>] <text>
+
+        Guesses who might have said <text>.  <channel> is only necessary if
+        the message isn't sent in the channel itself, and then only if
+        supybot.databases.plugins.channelSpecific is True.
+        """
+        msg.tag('guessed')
+        kinds = self.db.guessNick(channel, text)
+        if kinds:
+            if len(kinds) == 1:
+                (kind, prob) = kinds.pop()
+                irc.reply('It seems to me (with %.2f%% certainty) '
+                          'that %s said that.' % (prob*100, kind))
+            else:
+                kinds = ['%s (%.2f%%)' % (k, prob*100) for (k, prob) in kinds]
+                irc.reply('I\'m not quite sure who said that, but it could be '
+                          + utils.commaAndify(kinds, And='or'))
+        else:
+            irc.reply('I have no idea who might\'ve said that.')
+    who = wrap(who, ['channeldb', 'something'])
+
+    def train(self, irc, msg, args, channel, language, pattern):
+        """[<channel>] <language> <glob>
+
+        Trains the bot to recognize text similar to that contained in the
+        files matching <glob> as text of the language <language>.  <channel>
+        is only necessary if the message isn't sent in the channel itself,
+        and then only if supybot.databases.plugins.channelSpecific is True.
+ """ + filenames = glob.glob(pattern) + if not filenames: + irc.errorInvalid('glob', pattern) + for filename in filenames: + fd = file(filename) + for line in fd: + self.db.train(channel, language, line) + fd.close() + irc.replySuccess() + train = wrap(train, ['channeldb', 'something', 'something']) + + + +Class = Bayes + +# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: