Initial checkin.

2026-01-23 19:18:04 +01:00 · 2004-10-03 09:08:36 +00:00 · 2004-10-03 09:08:36 +00:00 · 4d27ef28f7
commit 4d27ef28f7
parent c563596405
4 changed files with 609 additions and 0 deletions
--- a/others/reverend/init.py
+++ b/others/reverend/init.py
--- a/others/reverend/splitter.py
+++ b/others/reverend/splitter.py
@ -0,0 +1,96 @@
 # This module is part of the Pyndex project and is Copyright 2003 Amir
 # Bakhtiar (amir@divmod.org). This is free software; you can redistribute
 # it and/or modify it under the terms of version 2.1 of the GNU Lesser
 # General Public License as published by the Free Software Foundation.
 import string
 class Splitter(object):
    """Split plain text into words" utility class
    Adapted from David Mertz's article in IBM developerWorks
    Needs work to handle international characters, etc"""
 ##    __slots__ = ['stemmer', 'porter', 'stopwording', 'word_only', 'nonword',
 ##                 'nondigits', 'alpha', 'ident', 'tokens', 'position']
    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}
    yes = string.lowercase + string.digits + '' # throw in any extras
    nonword = ''
    for i in range(0,255):
        if chr(i) not in yes:
            nonword += chr(i)
    word_only = string.maketrans(nonword, " " * len(nonword))
    nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '')
    alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '')
    ident = string.join(map(chr, range(256)), '')
    def close(self):
        # Lupy support
        pass
    def tokenStream(self, fieldName, file, casesensitive=False):
        """Split text/plain string into a list of words
        """
        self.tokens = self.split(file.read())
        self.position = 0
        return self
    def next(self):
        if self.position >= len(self.tokens):
            return None
        res = Token(self.tokens[self.position])
        self.position += 1
        return res
    def split(self, text, casesensitive=0):
        # Speedup trick: attributes into local scope
        word_only = self.word_only
        ident = self.ident
        alpha = self.alpha
        nondigits = self.nondigits
        # Let's adjust case if not case-sensitive
        if not casesensitive: text = string.lower(text)
        # Split the raw text
        allwords = text.translate(word_only).split()  # Let's strip funny byte values
        # Finally, let's skip some words not worth indexing
        words = []
        for word in allwords:
            if len(word) > 32: continue         # too long (probably gibberish)
            # Identify common patterns in non-word data (binary, UU/MIME, etc)
            num_nonalpha = len(word.translate(ident, alpha))
            numdigits    = len(word.translate(ident, nondigits))
            if numdigits > len(word)-2:         # almost all digits
                if numdigits > 5:               # too many digits is gibberish
                    continue                    # a moderate number is year/zipcode/etc
            elif num_nonalpha*2 > len(word):    # too much scattered nonalpha = gibberish
                continue
            word = word.translate(word_only)    # Let's strip funny byte values
            subwords = word.split()             # maybe embedded non-alphanumeric
            for subword in subwords:            # ...so we might have subwords
                if len(subword) <= 1: continue  # too short a subword
                words.append(subword)
        return words                
 class Token(object):
    """Wrapper around a single term string handed out by a token stream."""
    def __init__(self, trmText):
        # The wrapped term text; exposed through termText().
        self.trmText = trmText
    def termText(self):
        """Return the text of this token."""
        return self.trmText
    def __repr__(self):
        return '<Token: %r>' % (self.trmText,)
--- a/others/reverend/thomas.py
+++ b/others/reverend/thomas.py
@ -0,0 +1,309 @@
 # This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
 # amir@divmod.org.  This is free software; you can redistribute it and/or
 # modify it under the terms of version 2.1 of the GNU Lesser General Public
 # License as published by the Free Software Foundation.
 #
 import operator
 import string
 import math
 from sets import Set
 from splitter import Splitter
 class BayesData(dict):
    """Dictionary of token counts for one pool, plus bookkeeping attributes.

    name       -- name of the pool
    pool       -- stored as given; not used by this class itself
    training   -- list of the item ids recorded for this pool
    tokenCount -- token counter, starts at 0
    trainCount -- training counter, starts at 0
    """
    def __init__(self, name='', pool=None):
        self.name, self.pool = name, pool
        self.training = []
        self.tokenCount = self.trainCount = 0
    def trainedOn(self, item):
        """Tell whether item is recorded in this pool's training list."""
        recorded = self.training
        return item in recorded
    def __repr__(self):
        details = (self.name, self.tokenCount)
        return '<BayesDict: %s, %s tokens>' % details
 class Bayes(object):
    """Bayesian classifier that sorts tokenized items into named pools.

    Token counts are kept per pool in dataClass instances (BayesData by
    default) stored in self.pools.  The special pool '__Corpus__' receives
    the count of every token that is trained into any pool.  Per-pool token
    probabilities are computed lazily by poolProbs() and rebuilt whenever
    self.dirty is set.
    """
    def __init__(self, tokenizer=None, combiner=None, dataClass=None):
        """Create an empty classifier.

        tokenizer -- callable taking an item and returning a list of
                     strings; defaults to self.getTokens.
        combiner  -- callable taking (probs, poolName) and returning the
                     combined score; defaults to self.robinson.
        dataClass -- class used to hold pool data; defaults to BayesData.
        """
        if dataClass is None:
            self.dataClass = BayesData
        else:
            self.dataClass = dataClass
        self.corpus = self.dataClass('__Corpus__')
        self.pools = {}
        self.pools['__Corpus__'] = self.corpus
        self.trainCount = 0
        self.splitter = Splitter()
        self.dirty = True
        # The tokenizer takes an object and returns
        # a list of strings
        if tokenizer is None:
            self.tokenizer = self.getTokens
        else:
            self.tokenizer = tokenizer
        # The combiner combines probabilities
        if combiner is None:
            self.combiner = self.robinson
        else:
            self.combiner = combiner
    def split(self, text):
        """Split text into words using the Splitter instance."""
        return self.splitter.split(text)
    def commit(self):
        """Save the pools to the default file (see save())."""
        self.save()
    def newPool(self, poolName):
        """Create a new pool, without actually doing any
        training.  Returns the pool (the existing one if poolName is
        already known).
        """
        self.dirty = True # not always true, but it's simple
        return self.pools.setdefault(poolName, self.dataClass(poolName))
    def removePool(self, poolName):
        """Delete the pool named poolName; raises KeyError if unknown."""
        del(self.pools[poolName])
        self.dirty = True
    def renamePool(self, poolName, newName):
        """Make the pool named poolName available as newName instead.

        Raises KeyError if poolName is unknown.
        """
        pool = self.pools[poolName]
        if newName == poolName:
            # Nothing to do; storing and then removing the same key would
            # delete the pool.
            return
        self.pools[newName] = pool
        pool.name = newName
        self.removePool(poolName)
        self.dirty = True
    def mergePools(self, destPool, sourcePool):
        """Merge an existing pool into another.
        The data from sourcePool is merged into destPool.
        The arguments are the names of the pools to be merged.
        The pool named sourcePool is left intact and you may
        want to call removePool() to get rid of it.
        """
        sp = self.pools[sourcePool]
        dp = self.pools[destPool]
        for tok, count in sp.items():
            dp[tok] = dp.get(tok, 0) + count
            # tokenCount counts token occurrences (see _train/_untrain),
            # so every merged occurrence is added, not just new tokens.
            dp.tokenCount += count
        self.dirty = True
    def poolData(self, poolName):
        """Return a list of the (token, count) tuples.
        """
        return self.pools[poolName].items()
    def poolTokens(self, poolName):
        """Return a list of the tokens in this pool.
        """
        return [tok for tok, count in self.poolData(poolName)]
    def save(self, fname='bayesdata.dat'):
        """Pickle all pools into the file named fname."""
        from cPickle import dump
        fp = open(fname, 'wb')
        try:
            dump(self.pools, fp)
        finally:
            # Close the file even if pickling fails.
            fp.close()
    def load(self, fname='bayesdata.dat'):
        """Replace all pools with the ones pickled in the file named fname.

        Raises KeyError if the file holds no '__Corpus__' pool.
        """
        from cPickle import load
        fp = open(fname, 'rb')
        try:
            # NOTE: unpickling can execute arbitrary code; only load files
            # from a trusted source.
            self.pools = load(fp)
        finally:
            fp.close()
        self.corpus = self.pools['__Corpus__']
        self.dirty = True
    def poolNames(self):
        """Return a sorted list of Pool names.
        Does not include the system pool '__Corpus__'.
        """
        pools = self.pools.keys()
        pools.remove('__Corpus__')
        pools.sort()
        return pools
    def buildCache(self):
        """ merges corpora and computes probabilities

        Fills self.cache with one dataClass instance per pool (the corpus
        excluded) mapping words to a probability clamped to
        [0.0001, 0.9999].  Words whose probability lies within 0.1 of 0.5
        are left out.
        """
        self.cache = {}
        for pname, pool in self.pools.items():
            # skip our special pool
            if pname == '__Corpus__':
                continue
            poolCount = len(pool)
            themCount = max(len(self.corpus) - poolCount, 1)
            cacheDict = self.cache.setdefault(pname, self.dataClass(pname))
            for word, totCount in self.corpus.items():
                # for every word in the corpus
                # check to see if this pool contains this word
                thisCount = float(pool.get(word, 0.0))
                otherCount = float(totCount) - thisCount
                if not poolCount:
                    goodMetric = 1.0
                else:
                    goodMetric = min(1.0, otherCount/poolCount)
                badMetric = min(1.0, thisCount/themCount)
                f = badMetric / (goodMetric + badMetric)
                # PROBABILITY_THRESHOLD
                if abs(f-0.5) >= 0.1 :
                    # GOOD_PROB, BAD_PROB
                    cacheDict[word] = max(0.0001, min(0.9999, f))
    def poolProbs(self):
        """Return the probability cache, rebuilding it first if dirty."""
        if self.dirty:
            self.buildCache()
            self.dirty = False
        return self.cache
    def getTokens(self, obj):
        """Hopefully it's a string and we'll just split it
        on non-alphanumeric stuff.
        Override this in your subclass for objects other
        than text.
        Alternatively, you can pass in a tokenizer as part of
        instance creation.
        """
        return self.split(obj)
    def getProbs(self, pool, words):
        """ extracts the probabilities of tokens in a message

        Returns at most 2048 (word, probability) tuples for the words
        found in pool, highest probability first.
        """
        probs = [(word, pool[word]) for word in words if word in pool]
        probs.sort(lambda x,y: cmp(y[1],x[1]))
        return probs[:2048]
    def train(self, pool, item, uid=None):
        """Train Bayes by telling him that item belongs
        in pool. uid is optional and may be used to uniquely
        identify the item that is being trained on.
        """
        tokens = self.tokenizer(item)
        pool = self.pools.setdefault(pool, self.dataClass(pool))
        self._train(pool, tokens)
        self.corpus.trainCount += 1
        pool.trainCount += 1
        if uid:
            pool.training.append(uid)
        self.dirty = True
    def untrain(self, pool, item, uid=None):
        """Undo the training of item in pool.

        Does nothing when the pool is unknown or empty.  If uid is given
        and recorded for the pool, it is removed from the training list.
        """
        tokens = self.tokenizer(item)
        pool = self.pools.get(pool, None)
        if not pool:
            return
        self._untrain(pool, tokens)
        # I guess we want to count this as additional training?
        self.corpus.trainCount += 1
        pool.trainCount += 1
        # An unknown uid must not raise here: the counts are already
        # changed and the cache has to be marked dirty below.
        if uid and uid in pool.training:
            pool.training.remove(uid)
        self.dirty = True
    def _train(self, pool, tokens):
        """Add one occurrence of each token to pool and to the corpus."""
        wc = 0
        for token in tokens:
            count = pool.get(token, 0)
            pool[token] =  count + 1
            count = self.corpus.get(token, 0)
            self.corpus[token] =  count + 1
            wc += 1
        pool.tokenCount += wc
        self.corpus.tokenCount += wc
    def _untrain(self, pool, tokens):
        """Remove one occurrence of each token from pool and the corpus.

        Tokens whose count drops to zero are deleted; tokens that are not
        present are ignored.
        """
        for token in tokens:
            count = pool.get(token, 0)
            if count:
                if count == 1:
                    del(pool[token])
                else:
                    pool[token] =  count - 1
                pool.tokenCount -= 1
            count = self.corpus.get(token, 0)
            if count:
                if count == 1:
                    del(self.corpus[token])
                else:
                    self.corpus[token] =  count - 1
                self.corpus.tokenCount -= 1
    def trainedOn(self, msg):
        """Tell whether msg is recorded in the training list of any pool."""
        # Look at the pools themselves: the probability cache may not have
        # been built yet and its entries carry no training lists.
        for p in self.pools.values():
            if msg in p.training:
                return True
        return False
    def guess(self, msg):
        """Return a list of (poolName, score) tuples for msg, best first.

        Pools for which none of the message's tokens has a cached
        probability are left out.
        """
        tokens = Set(self.tokenizer(msg))
        pools = self.poolProbs()
        res = {}
        for pname, pprobs in pools.items():
            p = self.getProbs(pprobs, tokens)
            if len(p) != 0:
                res[pname]=self.combiner(p, pname)
        res = res.items()
        res.sort(lambda x,y: cmp(y[1], x[1]))
        return res
    def robinson(self, probs, ignore):
        """ computes the probability of a message being spam (Robinson's method)
            P = 1 - prod(1-p)^(1/n)
            Q = 1 - prod(p)^(1/n)
            S = (1 + (P-Q)/(P+Q)) / 2
            Courtesy of http://christophe.delord.free.fr/en/index.html

            probs is a non-empty list of (word, probability) tuples; the
            second argument is not used.
        """
        nth = 1./len(probs)
        P = 1.0 - reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0) ** nth
        Q = 1.0 - reduce(operator.mul, map(lambda p: p[1], probs), 1.0) ** nth
        S = (P - Q) / (P + Q)
        return (1 + S) / 2
    def robinsonFisher(self, probs, ignore):
        """ computes the probability of a message being spam (Robinson-Fisher method)
            H = C-1( -2.ln(prod(p)), 2*n )
            S = C-1( -2.ln(prod(1-p)), 2*n )
            I = (1 + H - S) / 2
            Courtesy of http://christophe.delord.free.fr/en/index.html

            probs is a list of (word, probability) tuples; the second
            argument is not used.
        """
        n = len(probs)
        # ln(prod(p)) is computed as sum(ln(p)) so that a long list of
        # small probabilities cannot underflow the product to zero.  A
        # probability of exactly 0 makes math.log raise (ValueError or
        # OverflowError depending on the Python version).
        try:
            H = chi2P(-2.0 * sum([math.log(p[1]) for p in probs]), 2*n)
        except (OverflowError, ValueError):
            H = 0.0
        try:
            S = chi2P(-2.0 * sum([math.log(1.0-p[1]) for p in probs]), 2*n)
        except (OverflowError, ValueError):
            S = 0.0
        return (1 + H - S) / 2
    def __repr__(self):
        return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]
    def __len__(self):
        """Return the number of distinct tokens in the corpus."""
        return len(self.corpus)
 def chi2P(chi, df):
    """ return P(chisq >= chi, with df degree of freedom)
    df must be even

    The result is capped at 1.0.
    """
    assert df & 1 == 0
    m = chi / 2.0
    # Accumulate exp(-m) * sum(m**i / i!) for i in 0 .. df/2 - 1, building
    # each term from the previous one.
    total = term = math.exp(-m)
    # Floor division keeps the bound an integer under true division too.
    for i in range(1, df // 2):
        term *= m/i
        total += term
    return min(total, 1.0)
--- a/plugins/Bayes.py
+++ b/plugins/Bayes.py
@ -0,0 +1,204 @@
 ###
 # Copyright (c) 2004, Jeremiah Fincher
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 #   * Redistributions of source code must retain the above copyright notice,
 #     this list of conditions, and the following disclaimer.
 #   * Redistributions in binary form must reproduce the above copyright notice,
 #     this list of conditions, and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 #   * Neither the name of the author of this software nor the name of
 #     contributors to this software may be used to endorse or promote products
 #     derived from this software without specific prior written consent.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 ###
 """
 Uses Bayesian classifiers to guess how channel messages should be classified
 and which nick might have said them.
 """
 import supybot
 __revision__ = "$Id$"
 __author__ = supybot.authors.jemfinch
 __contributors__ = {}
 import supybot.plugins as plugins
 import glob
 import os.path
 import reverend.thomas
 from cStringIO import StringIO as sio
 import supybot.conf as conf
 import supybot.utils as utils
 from supybot.commands import *
 import supybot.ircutils as ircutils
 import supybot.registry as registry
 import supybot.callbacks as callbacks
 def configure(advanced):
    """Configure this module; called by setup.py.

    advanced is a bool that specifies whether the user identified himself
    as an advanced user or not.  Configuration is effected by manipulating
    the registry as appropriate.
    """
    from supybot.questions import expect, anything, something, yn
    conf.registerPlugin('Bayes', True)
 # Register the plugin's configuration group and its per-channel values.
 # The name Bayes is rebound by the plugin class defined later in this file.
 Bayes = conf.registerPlugin('Bayes')
 # NOTE(review): nothing visible in this module reads maximumLines -- confirm
 # whether it is still needed.
 conf.registerChannelValue(Bayes, 'maximumLines',
    registry.NonNegativeInteger(4, """Determines the maximum allowable number
    of consecutive messages that classify as a paste.  If this value is 0, no
    checking will be done."""))
 def tokenize(s):
    """Lower-case s and split it on whitespace into a list of tokens."""
    lowered = s.lower()
    return lowered.split()
 class PickleBayesDB(plugins.DbiChannelDB):
    """Channel database whose per-file DB holds two Bayesian classifiers."""
    class DB(object):
        """Two classifiers saved to disk via reverend.thomas.Bayes.save().

        self.bayes classifies text into kinds and is stored in filename;
        self.nickBayes classifies text by nick and is stored in
        nickFilename, which is filename with 'pickle' replaced by
        'nick.pickle'.
        """
        def __init__(self, filename):
            self.filename = filename
            self.nickFilename = self.filename.replace('pickle', 'nick.pickle')
            self.bayes = reverend.thomas.Bayes(tokenize)
            if os.path.exists(filename):
                self.bayes.load(filename)
            self.nickBayes = reverend.thomas.Bayes(tokenize)
            # Must be the attribute: a bare nickFilename is not defined here.
            if os.path.exists(self.nickFilename):
                self.nickBayes.load(self.nickFilename)
        def close(self):
            """Save both classifiers to their files."""
            self.bayes.save(self.filename)
            self.nickBayes.save(self.nickFilename)
        # Flushing is the same operation as closing: write everything out.
        flush = close
        def train(self, kind, s):
            """Train the text classifier that s is of the given kind."""
            self.bayes.train(kind, s)
        def trainNick(self, nick, s):
            """Train the nick classifier that s was said by nick."""
            self.nickBayes.train(nick, s)
        def guess(self, s):
            """Return the best (kind, probability) match for s, or None.

            A match is only returned when its probability exceeds 0.5 and
            it leads the runner-up by at least 0.4.  When the classifier
            has no match at all, s is trained as 'normal'.
            """
            matches = self.bayes.guess(s)
            if not matches:
                self.bayes.train('normal', s)
                return None
            if not matches[0][1] > 0.5:
                return None
            if len(matches) > 1 and \
               matches[0][1] - matches[1][1] < 0.4:
                # Too close to the second-best match to be sure.
                return None
            return matches[0]
        def guessNick(self, s):
            """Return the (nick, probability) matches for s above 0.01.

            When the best match is more than twice as probable as the
            second one, only the best match is returned.
            """
            L = [t for t in self.nickBayes.guess(s) if t[1] > 0.01]
            if len(L) > 1:
                if L[0][1] / L[1][1] > 2:
                    return [L[0]]
            return L
 # Database factory for this plugin; 'pickle' is the only backend listed.
 BayesDB = plugins.DB('Bayes', {'pickle': PickleBayesDB})
 class Bayes(callbacks.Privmsg):
    # Plugin that classifies channel messages with Bayesian classifiers and
    # trains a per-nick classifier on everything said in a channel.
    def __init__(self):
        self.__parent = super(Bayes, self)
        self.__parent.__init__()
        self.db = BayesDB()
    def die(self):
        # Close the database when the plugin goes away.
        self.db.close()
    def doPrivmsg(self, irc, msg):
        # Classify every channel message and train the nick classifier on
        # it.  Messages tagged 'guessed' (the guess and who commands tag
        # theirs) are skipped.
        (channel, text) = msg.args
        if not ircutils.isChannel(channel) or msg.guessed:
            return
        kind = self.db.guess(channel, text)
        if kind is not None:
            (kind, prob) = kind
            prob *= 100
            # Shorten a copy for the log only; the nick classifier below
            # must be trained on the complete text.
            shortText = utils.ellipsisify(text, 30)
            self.log.warning('Classified %r as %s. (%.2f%%)',
                             shortText, kind, prob)
        self.db.trainNick(channel, msg.nick, text)
    def guess(self, irc, msg, args, channel, text):
        """[<channel>] <text>
        Guesses how <text> should be classified according to the Bayesian
        classifier for <channel>.  <channel> is only necessary if the message
        isn't sent in the channel itself, and then only if
        supybot.databases.plugins.channelSpecific is True.
        """
        msg.tag('guessed')
        kind = self.db.guess(channel, text)
        if kind is not None:
            (kind, prob) = kind
            # prob is shown as a percentage, hence the literal %% sign.
            prob *= 100
            irc.reply('That seems to me to be %s, '
                      'but I\'m only %.2f%% certain.' % (kind, prob))
        else:
            irc.reply('I don\'t know what the heck that is.')
    guess = wrap(guess, ['channeldb', 'something'])
    def who(self, irc, msg, args, channel, text):
        """[<channel>] <text>
        Guesses who might have said <text>.  <channel> is only necessary if the
        message isn't sent in the channel itself, and then only if
        supybot.databases.plugins.channelSpecific is True.
        """
        msg.tag('guessed')
        kinds = self.db.guessNick(channel, text)
        if kinds:
            if len(kinds) == 1:
                (kind, prob) = kinds.pop()
                irc.reply('It seems to me (with %.2f%% certainty) '
                          'that %s said that.' % (prob*100, kind))
            else:
                kinds = ['%s (%.2f%%)' % (k, prob*100) for (k, prob) in kinds]
                irc.reply('I\'m not quite sure who said that, but it could be '
                          + utils.commaAndify(kinds, And='or'))
        else:
            irc.reply('I have no idea who might\'ve said that.')
    who = wrap(who, ['channeldb', 'something'])
    def train(self, irc, msg, args, channel, language, pattern):
        """[<channel>] <language> <glob>
        Trains the bot to recognize text similar to that contained in the files
        matching <glob> as text of the language <language>.  <channel> is only
        necessary if the message isn't sent in the channel itself, and then
        only if supybot.databases.plugins.channelSpecific is True.
        """
        filenames = glob.glob(pattern)
        if not filenames:
            # NOTE(review): presumably errorInvalid raises and ends the
            # command here -- confirm against supybot's callbacks.
            irc.errorInvalid('glob', pattern)
        for filename in filenames:
            fd = file(filename)
            try:
                # Every line of the file is one training sample.
                for line in fd:
                    self.db.train(channel, language, line)
            finally:
                # Close the file even if training fails part-way.
                fd.close()
        irc.replySuccess()
    train = wrap(train, ['channeldb', 'something', 'something'])
 # Export the plugin class under the name Class.
 # NOTE(review): presumably the name supybot's plugin loader looks up -- confirm.
 Class = Bayes
 # vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: