From 4d27ef28f74e105ff244b7ec0e636a765c9de02f Mon Sep 17 00:00:00 2001
From: Jeremy Fincher
Date: Sun, 3 Oct 2004 09:08:36 +0000
Subject: [PATCH] Initial checkin.

---
 others/reverend/__init__.py |   0
 others/reverend/splitter.py |  96 +++++++++++
 others/reverend/thomas.py   | 309 ++++++++++++++++++++++++++++++++++++
 plugins/Bayes.py            | 204 ++++++++++++++++++++++++
 4 files changed, 609 insertions(+)
 create mode 100755 others/reverend/__init__.py
 create mode 100755 others/reverend/splitter.py
 create mode 100755 others/reverend/thomas.py
 create mode 100644 plugins/Bayes.py

diff --git a/others/reverend/__init__.py b/others/reverend/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/others/reverend/splitter.py b/others/reverend/splitter.py
new file mode 100755
index 000000000..04bfea96e
--- /dev/null
+++ b/others/reverend/splitter.py
@@ -0,0 +1,96 @@
+# This module is part of the Pyndex project and is Copyright 2003 Amir
+# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
+# it and/or modify it under the terms of version 2.1 of the GNU Lesser
+# General Public License as published by the Free Software Foundation.
+
+import string
+
+class Splitter(object):
+    """Split plain text into words: utility class.
+    Adapted from David Mertz's article in IBM developerWorks.
+    Needs work to handle international characters, etc."""
+
+##    __slots__ = ['stemmer', 'porter', 'stopwording', 'word_only', 'nonword',
+##                 'nondigits', 'alpha', 'ident', 'tokens', 'position']
+
+    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
+                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
+                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
+                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
+                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
+                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
+                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}
+
+    yes = string.lowercase + string.digits + ''  # throw in any extras
+    nonword = ''
+    for i in range(0, 255):
+        if chr(i) not in yes:
+            nonword += chr(i)
+
+    word_only = string.maketrans(nonword, " " * len(nonword))
+
+    nondigits = string.join(map(chr, range(0, 48)) + map(chr, range(58, 255)), '')
+    alpha = string.join(map(chr, range(65, 91)) + map(chr, range(97, 123)), '')
+    ident = string.join(map(chr, range(256)), '')
+
+    def close(self):
+        # Lupy support
+        pass
+
+    def tokenStream(self, fieldName, file, casesensitive=False):
+        """Split a text/plain string into a list of words.
+        """
+        self.tokens = self.split(file.read())
+        self.position = 0
+        return self
+
+    def next(self):
+        if self.position >= len(self.tokens):
+            return None
+        res = Token(self.tokens[self.position])
+        self.position += 1
+        return res
+
+    def split(self, text, casesensitive=0):
+        # Speedup trick: attributes into local scope
+        word_only = self.word_only
+        ident = self.ident
+        alpha = self.alpha
+        nondigits = self.nondigits
+
+        # Let's adjust case if not case-sensitive
+        if not casesensitive: text = string.lower(text)
+
+        # Split the raw text
+        allwords = text.translate(word_only).split()  # Let's strip funny byte values
+
+        # Finally, let's skip some words not worth indexing
+        words = []
+        for word in allwords:
+            if len(word) > 32: continue  # too long (probably gibberish)
+
+            # Identify common patterns in non-word data (binary, UU/MIME, etc)
+            num_nonalpha = len(word.translate(ident, alpha))
+            numdigits = len(word.translate(ident, nondigits))
+            if numdigits > len(word)-2:  # almost all digits
+                if numdigits > 5:  # too many digits is gibberish
+                    continue  # a moderate number is year/zipcode/etc
+            elif num_nonalpha*2 > len(word):  # too much scattered nonalpha = gibberish
+                continue
+
+            word = word.translate(word_only)  # Let's strip funny byte values
+            subwords = word.split()  # maybe embedded non-alphanumeric
+            for subword in subwords:  # ...so we might have subwords
+                if len(subword) <= 1: continue  # too short a subword
+                words.append(subword)
+
+        return words
+
+class Token:
+    def __init__(self, trmText):
+        self.trmText = trmText
+
+    def termText(self):
+        return self.trmText
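What split() keeps and drops is easiest to see on a concrete input. A rough
sketch (hypothetical strings; Python 2, since the module relies on
string.lowercase and string.maketrans; assumes others/ is on sys.path so the
package imports as reverend):

    from reverend.splitter import Splitter

    s = Splitter()
    print s.split('Hello, World!  My ZIP is 02139; my card is 4111111111111111.')
    # -> ['hello', 'world', 'my', 'zip', 'is', '02139', 'my', 'card', 'is']
    # '02139' survives the digit heuristic (five digits or fewer is assumed
    # to be a year/zipcode), while the sixteen-digit card number is dropped
    # as gibberish.  Note that stopWords is defined but never consulted by
    # split(), so 'my' and 'is' pass through.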
diff --git a/others/reverend/thomas.py b/others/reverend/thomas.py
new file mode 100755
index 000000000..7c307bbce
--- /dev/null
+++ b/others/reverend/thomas.py
@@ -0,0 +1,309 @@
+# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
+# amir@divmod.org. This is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+
+import operator
+import string
+import math
+from sets import Set
+from splitter import Splitter
+
+class BayesData(dict):
+
+    def __init__(self, name='', pool=None):
+        self.name = name
+        self.training = []
+        self.pool = pool
+        self.tokenCount = 0
+        self.trainCount = 0
+
+    def trainedOn(self, item):
+        return item in self.training
+
+    def __repr__(self):
+        return '<BayesDict: %s, %s tokens>' % (self.name, self.tokenCount)
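Each pool, including the special '__Corpus__' pool, is a BayesData: an
ordinary dict mapping token to count, with naming and counting bookkeeping
bolted on. A small sketch of the shape (hypothetical contents; Python 2):

    pool = BayesData('python')
    pool['def'] = 3
    pool['return'] = 2
    pool.tokenCount = 5
    print pool                      # -> <BayesDict: python, 5 tokens>
    print pool.trainedOn('msg-42')  # -> False; train() appends uids to
                                    #    pool.training when given one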
+ """ + return [tok for tok, count in self.poolData(poolName)] + + def save(self, fname='bayesdata.dat'): + from cPickle import dump + fp = open(fname, 'wb') + dump(self.pools, fp) + fp.close() + + def load(self, fname='bayesdata.dat'): + from cPickle import load + fp = open(fname, 'rb') + self.pools = load(fp) + fp.close() + self.corpus = self.pools['__Corpus__'] + self.dirty = True + + def poolNames(self): + """Return a sorted list of Pool names. + Does not include the system pool '__Corpus__'. + """ + pools = self.pools.keys() + pools.remove('__Corpus__') + pools = [pool for pool in pools] + pools.sort() + return pools + + def buildCache(self): + """ merges corpora and computes probabilities + """ + self.cache = {} + for pname, pool in self.pools.items(): + # skip our special pool + if pname == '__Corpus__': + continue + + poolCount = len(pool) + themCount = max(len(self.corpus) - poolCount, 1) + cacheDict = self.cache.setdefault(pname, self.dataClass(pname)) + + for word, totCount in self.corpus.items(): + # for every word in the copus + # check to see if this pool contains this word + thisCount = float(pool.get(word, 0.0)) + otherCount = float(totCount) - thisCount + + if not poolCount: + goodMetric = 1.0 + else: + goodMetric = min(1.0, otherCount/poolCount) + badMetric = min(1.0, thisCount/themCount) + f = badMetric / (goodMetric + badMetric) + + # PROBABILITY_THRESHOLD + if abs(f-0.5) >= 0.1 : + # GOOD_PROB, BAD_PROB + cacheDict[word] = max(0.0001, min(0.9999, f)) + + def poolProbs(self): + if self.dirty: + self.buildCache() + self.dirty = False + return self.cache + + def getTokens(self, obj): + """Hopefully it's a string and we'll just split it + on non-alphanumeric stuff. + + Override this in your subclass for objects other + than text. + + Alternatively, you can pass in a tokenizer as part of + instance creation. + """ + return self.split(obj) + + def getProbs(self, pool, words): + """ extracts the probabilities of tokens in a message + """ + probs = [(word, pool[word]) for word in words if word in pool] + probs.sort(lambda x,y: cmp(y[1],x[1])) + return probs[:2048] + + def train(self, pool, item, uid=None): + """Train Bayes by telling him that item belongs + in pool. uid is optional and may be used to uniquely + identify the item that is being trained on. + """ + tokens = self.tokenizer(item) + pool = self.pools.setdefault(pool, self.dataClass(pool)) + self._train(pool, tokens) + self.corpus.trainCount += 1 + pool.trainCount += 1 + if uid: + pool.training.append(uid) + self.dirty = True + + def untrain(self, pool, item, uid=None): + tokens = self.tokenizer(item) + pool = self.pools.get(pool, None) + if not pool: + return + self._untrain(pool, tokens) + # I guess we want to count this as additional training? 
+        self.corpus.trainCount += 1
+        pool.trainCount += 1
+        if uid:
+            pool.training.remove(uid)
+        self.dirty = True
+
+    def _train(self, pool, tokens):
+        wc = 0
+        for token in tokens:
+            count = pool.get(token, 0)
+            pool[token] = count + 1
+            count = self.corpus.get(token, 0)
+            self.corpus[token] = count + 1
+            wc += 1
+        pool.tokenCount += wc
+        self.corpus.tokenCount += wc
+
+    def _untrain(self, pool, tokens):
+        for token in tokens:
+            count = pool.get(token, 0)
+            if count:
+                if count == 1:
+                    del(pool[token])
+                else:
+                    pool[token] = count - 1
+                pool.tokenCount -= 1
+
+            count = self.corpus.get(token, 0)
+            if count:
+                if count == 1:
+                    del(self.corpus[token])
+                else:
+                    self.corpus[token] = count - 1
+                self.corpus.tokenCount -= 1
+
+    def trainedOn(self, msg):
+        for p in self.cache.values():
+            if msg in p.training:
+                return True
+        return False
+
+    def guess(self, msg):
+        tokens = Set(self.tokenizer(msg))
+        pools = self.poolProbs()
+
+        res = {}
+        for pname, pprobs in pools.items():
+            p = self.getProbs(pprobs, tokens)
+            if len(p) != 0:
+                res[pname] = self.combiner(p, pname)
+        res = res.items()
+        res.sort(lambda x, y: cmp(y[1], x[1]))
+        return res
+
+    def robinson(self, probs, ignore):
+        """ computes the probability of a message being spam (Robinson's method)
+            P = 1 - prod(1-p)^(1/n)
+            Q = 1 - prod(p)^(1/n)
+            S = (1 + (P-Q)/(P+Q)) / 2
+            Courtesy of http://christophe.delord.free.fr/en/index.html
+        """
+        nth = 1./len(probs)
+        P = 1.0 - reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0) ** nth
+        Q = 1.0 - reduce(operator.mul, map(lambda p: p[1], probs)) ** nth
+        S = (P - Q) / (P + Q)
+        return (1 + S) / 2
+
+    def robinsonFisher(self, probs, ignore):
+        """ computes the probability of a message being spam (Robinson-Fisher method)
+            H = C-1( -2.ln(prod(p)), 2*n )
+            S = C-1( -2.ln(prod(1-p)), 2*n )
+            I = (1 + H - S) / 2
+            Courtesy of http://christophe.delord.free.fr/en/index.html
+        """
+        n = len(probs)
+        try: H = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: p[1], probs), 1.0)), 2*n)
+        except OverflowError: H = 0.0
+        try: S = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0)), 2*n)
+        except OverflowError: S = 0.0
+        return (1 + H - S) / 2
+
+    def __repr__(self):
+        return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]
+
+    def __len__(self):
+        return len(self.corpus)
+
+
+def chi2P(chi, df):
+    """ return P(chisq >= chi, with df degrees of freedom)
+
+    df must be even
+    """
+    assert df & 1 == 0
+    m = chi / 2.0
+    sum = term = math.exp(-m)
+    for i in range(1, df/2):
+        term *= m/i
+        sum += term
+    return min(sum, 1.0)
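robinson() is the default combiner. For the n cached tokens that matched, it
computes P = 1 - prod(1-p)^(1/n) and Q = 1 - prod(p)^(1/n), and returns
(1 + (P-Q)/(P+Q)) / 2, so 0.5 means no signal either way. A worked sketch
with made-up token probabilities (Python 2):

    import operator

    probs = [('viagra', 0.99), ('meeting', 0.20), ('free', 0.90)]
    nth = 1.0 / len(probs)
    P = 1.0 - reduce(operator.mul, [1.0 - p for (tok, p) in probs], 1.0) ** nth
    Q = 1.0 - reduce(operator.mul, [p for (tok, p) in probs], 1.0) ** nth
    S = (P - Q) / (P + Q)
    print (1 + S) / 2
    # P ~ 0.907, Q ~ 0.437, result ~ 0.675: leaning toward this pool,
    # but not decisively.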
diff --git a/plugins/Bayes.py b/plugins/Bayes.py
new file mode 100644
index 000000000..14aef10e6
--- /dev/null
+++ b/plugins/Bayes.py
@@ -0,0 +1,204 @@
+###
+# Copyright (c) 2004, Jeremiah Fincher
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#   * Redistributions of source code must retain the above copyright notice,
+#     this list of conditions, and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions, and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   * Neither the name of the author of this software nor the name of
+#     contributors to this software may be used to endorse or promote products
+#     derived from this software without specific prior written consent.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###
+
+"""
+Watches for paste-floods in a channel and takes appropriate measures against
+violators.
+"""
+
+import supybot
+
+__revision__ = "$Id$"
+__author__ = supybot.authors.jemfinch
+__contributors__ = {}
+
+import supybot.plugins as plugins
+
+import glob
+import os.path
+import reverend.thomas
+from cStringIO import StringIO as sio
+
+import supybot.conf as conf
+import supybot.utils as utils
+from supybot.commands import *
+import supybot.ircutils as ircutils
+import supybot.registry as registry
+import supybot.callbacks as callbacks
+
+
+def configure(advanced):
+    # This will be called by setup.py to configure this module.  Advanced is
+    # a bool that specifies whether the user identified himself as an advanced
+    # user or not.  You should effect your configuration by manipulating the
+    # registry as appropriate.
+    from supybot.questions import expect, anything, something, yn
+    conf.registerPlugin('Bayes', True)
+
+Bayes = conf.registerPlugin('Bayes')
+conf.registerChannelValue(Bayes, 'maximumLines',
+    registry.NonNegativeInteger(4, """Determines the maximum allowable number
+    of consecutive messages that classify as a paste.
+    If this value is 0, no checking will be done."""))
+
+def tokenize(s):
+    return s.lower().split()
+
+class PickleBayesDB(plugins.DbiChannelDB):
+    class DB(object):
+        def __init__(self, filename):
+            self.filename = filename
+            self.nickFilename = self.filename.replace('pickle', 'nick.pickle')
+            self.bayes = reverend.thomas.Bayes(tokenize)
+            if os.path.exists(filename):
+                self.bayes.load(filename)
+            self.nickBayes = reverend.thomas.Bayes(tokenize)
+            if os.path.exists(self.nickFilename):
+                self.nickBayes.load(self.nickFilename)
+
+        def close(self):
+            self.bayes.save(self.filename)
+            self.nickBayes.save(self.nickFilename)
+        flush = close
+
+        def train(self, kind, s):
+            self.bayes.train(kind, s)
+
+        def trainNick(self, nick, s):
+            self.nickBayes.train(nick, s)
+
+        def guess(self, s):
+            matches = self.bayes.guess(s)
+            if matches:
+                if matches[0][1] > 0.5:
+                    if len(matches) > 1 and \
+                       matches[0][1] - matches[1][1] < 0.4:
+                        return None
+                    else:
+                        return matches[0]
+            else:
+                self.bayes.train('normal', s)
+            return None
+
+        def guessNick(self, s):
+            L = [t for t in self.nickBayes.guess(s) if t[1] > 0.01]
+            if len(L) > 1:
+                if L[0][1] / L[1][1] > 2:
+                    return [L[0]]
+            return L
+
+BayesDB = plugins.DB('Bayes', {'pickle': PickleBayesDB})
+
+class Bayes(callbacks.Privmsg):
+    def __init__(self):
+        self.__parent = super(Bayes, self)
+        self.__parent.__init__()
+        self.db = BayesDB()
+
+    def die(self):
+        self.db.close()
+
+    def doPrivmsg(self, irc, msg):
+        (channel, text) = msg.args
+        if not ircutils.isChannel(channel) or msg.guessed:
+            return
+        kind = self.db.guess(channel, text)
+        if kind is not None:
+            (kind, prob) = kind
+            prob *= 100
+            text = utils.ellipsisify(text, 30)
+            self.log.warning('Classified %r as %s. (%.2f%%)', text, kind, prob)
+        self.db.trainNick(channel, msg.nick, text)
+
+    def guess(self, irc, msg, args, channel, text):
+        """[<channel>] <text>
+
+        Guesses how <text> should be classified according to the Bayesian
+        classifier for <channel>.  <channel> is only necessary if the message
+        isn't sent in the channel itself, and then only if
+        supybot.databases.plugins.channelSpecific is True.
+        """
+        msg.tag('guessed')
+        kind = self.db.guess(channel, text)
+        if kind is not None:
+            (kind, prob) = kind
+            prob *= 100
+            irc.reply('That seems to me to be %s, '
+                      'but I\'m only %.2f%% certain.' % (kind, prob))
+        else:
+            irc.reply('I don\'t know what the heck that is.')
+    guess = wrap(guess, ['channeldb', 'something'])
+
+    def who(self, irc, msg, args, channel, text):
+        """[<channel>] <text>
+
+        Guesses who might have said <text>.  <channel> is only necessary if
+        the message isn't sent in the channel itself, and then only if
+        supybot.databases.plugins.channelSpecific is True.
+        """
+        msg.tag('guessed')
+        kinds = self.db.guessNick(channel, text)
+        if kinds:
+            if len(kinds) == 1:
+                (kind, prob) = kinds.pop()
+                irc.reply('It seems to me (with %.2f%% certainty) '
+                          'that %s said that.' % (prob*100, kind))
+            else:
+                kinds = ['%s (%.2f%%)' % (k, prob*100) for (k, prob) in kinds]
+                irc.reply('I\'m not quite sure who said that, but it could be '
+                          + utils.commaAndify(kinds, And='or'))
+        else:
+            irc.reply('I have no idea who might\'ve said that.')
+    who = wrap(who, ['channeldb', 'something'])
+
+    def train(self, irc, msg, args, channel, language, pattern):
+        """[<channel>] <language> <glob>
+
+        Trains the bot to recognize text similar to that contained in the
+        files matching <glob> as text of the language <language>.  <channel>
+        is only necessary if the message isn't sent in the channel itself,
+        and then only if supybot.databases.plugins.channelSpecific is True.
+ """ + filenames = glob.glob(pattern) + if not filenames: + irc.errorInvalid('glob', pattern) + for filename in filenames: + fd = file(filename) + for line in fd: + self.db.train(channel, language, line) + fd.close() + irc.replySuccess() + train = wrap(train, ['channeldb', 'something', 'something']) + + + +Class = Bayes + +# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: