Initial checkin.

Jeremy Fincher 2004-10-03 09:08:36 +00:00
parent c563596405
commit 4d27ef28f7
4 changed files with 609 additions and 0 deletions

others/reverend/__init__.py    Executable file    (0 additions)

others/reverend/splitter.py    Executable file    (96 additions)

@@ -0,0 +1,96 @@
# This module is part of the Pyndex project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.
import string
class Splitter(object):
"""Split plain text into words" utility class
Adapted from David Mertz's article in IBM developerWorks
Needs work to handle international characters, etc"""
## __slots__ = ['stemmer', 'porter', 'stopwording', 'word_only', 'nonword',
## 'nondigits', 'alpha', 'ident', 'tokens', 'position']
stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
'no': 1, 'these': 1, 'of': 1, 'there': 1,
'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}
yes = string.lowercase + string.digits + '' # throw in any extras
nonword = ''
    for i in range(0, 256):  # full byte range; 256, not 255, so chr(255) is covered
if chr(i) not in yes:
nonword += chr(i)
word_only = string.maketrans(nonword, " " * len(nonword))
    nondigits = string.join(map(chr, range(0, 48)) + map(chr, range(58, 256)), '')
alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '')
ident = string.join(map(chr, range(256)), '')
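    # How these tables are used below: str.translate(table, deletechars)
    # first deletes deletechars, then maps the remaining bytes through
    # table.  word_only maps every byte outside [a-z0-9] to a space; ident
    # is the identity table, so translate(ident, alpha) deletes the
    # alphabetic bytes (leaving the non-alpha ones to be counted) and
    # translate(ident, nondigits) deletes everything but digits.  Note that
    # stopWords is defined but never consulted by split(); callers that
    # want stop-word removal must filter the result themselves.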
def close(self):
# Lupy support
pass
def tokenStream(self, fieldName, file, casesensitive=False):
"""Split text/plain string into a list of words
"""
        self.tokens = self.split(file.read(), casesensitive)
self.position = 0
return self
def next(self):
if self.position >= len(self.tokens):
return None
res = Token(self.tokens[self.position])
self.position += 1
return res
def split(self, text, casesensitive=0):
# Speedup trick: attributes into local scope
word_only = self.word_only
ident = self.ident
alpha = self.alpha
nondigits = self.nondigits
# Let's adjust case if not case-sensitive
if not casesensitive: text = string.lower(text)
# Split the raw text
allwords = text.translate(word_only).split() # Let's strip funny byte values
# Finally, let's skip some words not worth indexing
words = []
for word in allwords:
if len(word) > 32: continue # too long (probably gibberish)
# Identify common patterns in non-word data (binary, UU/MIME, etc)
num_nonalpha = len(word.translate(ident, alpha))
numdigits = len(word.translate(ident, nondigits))
if numdigits > len(word)-2: # almost all digits
if numdigits > 5: # too many digits is gibberish
continue # a moderate number is year/zipcode/etc
elif num_nonalpha*2 > len(word): # too much scattered nonalpha = gibberish
continue
word = word.translate(word_only) # Let's strip funny byte values
subwords = word.split() # maybe embedded non-alphanumeric
for subword in subwords: # ...so we might have subwords
if len(subword) <= 1: continue # too short a subword
words.append(subword)
return words
class Token:
def __init__(self, trmText):
self.trmText = trmText
def termText(self):
return self.trmText
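As a quick orientation, here is how the splitter behaves end to end (a
hypothetical Python 2 session; the sample sentence is invented):

    from reverend.splitter import Splitter

    sp = Splitter()
    print sp.split("The quick brown fox, jumping 42 times!")
    # -> ['the', 'quick', 'brown', 'fox', 'jumping', '42', 'times']
    # Lowercased, punctuation stripped; '42' survives because a short run
    # of digits is treated as a year/zipcode-style token.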

others/reverend/thomas.py    Executable file    (309 additions)

@@ -0,0 +1,309 @@
# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
# amir@divmod.org. This is free software; you can redistribute it and/or
# modify it under the terms of version 2.1 of the GNU Lesser General Public
# License as published by the Free Software Foundation.
#
import operator
import string
import math
from sets import Set
from splitter import Splitter
class BayesData(dict):
def __init__(self, name='', pool=None):
self.name = name
self.training = []
self.pool = pool
self.tokenCount = 0
self.trainCount = 0
def trainedOn(self, item):
return item in self.training
def __repr__(self):
        return '<BayesData: %s, %s tokens>' % (self.name, self.tokenCount)
class Bayes(object):
def __init__(self, tokenizer=None, combiner=None, dataClass=None):
if dataClass is None:
self.dataClass = BayesData
else:
self.dataClass = dataClass
self.corpus = self.dataClass('__Corpus__')
self.pools = {}
self.pools['__Corpus__'] = self.corpus
self.trainCount = 0
self.splitter = Splitter()
self.dirty = True
# The tokenizer takes an object and returns
# a list of strings
if tokenizer is None:
self.tokenizer = self.getTokens
else:
self.tokenizer = tokenizer
# The combiner combines probabilities
if combiner is None:
self.combiner = self.robinson
else:
self.combiner = combiner
def split(self, text):
return self.splitter.split(text)
def commit(self):
self.save()
def newPool(self, poolName):
"""Create a new pool, without actually doing any
training.
"""
self.dirty = True # not always true, but it's simple
return self.pools.setdefault(poolName, self.dataClass(poolName))
def removePool(self, poolName):
del(self.pools[poolName])
self.dirty = True
def renamePool(self, poolName, newName):
self.pools[newName] = self.pools[poolName]
self.pools[newName].name = newName
self.removePool(poolName)
self.dirty = True
def mergePools(self, destPool, sourcePool):
"""Merge an existing pool into another.
The data from sourcePool is merged into destPool.
The arguments are the names of the pools to be merged.
        The pool named sourcePool is left intact and you may
want to call removePool() to get rid of it.
"""
sp = self.pools[sourcePool]
dp = self.pools[destPool]
        for tok, count in sp.items():
            if dp.get(tok):
                dp[tok] += count
            else:
                dp[tok] = count
            # keep tokenCount in step with _train(), which counts every
            # token occurrence rather than distinct tokens
            dp.tokenCount += count
self.dirty = True
def poolData(self, poolName):
"""Return a list of the (token, count) tuples.
"""
return self.pools[poolName].items()
def poolTokens(self, poolName):
"""Return a list of the tokens in this pool.
"""
return [tok for tok, count in self.poolData(poolName)]
def save(self, fname='bayesdata.dat'):
from cPickle import dump
fp = open(fname, 'wb')
dump(self.pools, fp)
fp.close()
def load(self, fname='bayesdata.dat'):
from cPickle import load
fp = open(fname, 'rb')
self.pools = load(fp)
fp.close()
self.corpus = self.pools['__Corpus__']
self.dirty = True
def poolNames(self):
"""Return a sorted list of Pool names.
Does not include the system pool '__Corpus__'.
"""
        pools = self.pools.keys()
        pools.remove('__Corpus__')
        pools.sort()
        return pools
def buildCache(self):
""" merges corpora and computes probabilities
"""
self.cache = {}
for pname, pool in self.pools.items():
# skip our special pool
if pname == '__Corpus__':
continue
poolCount = len(pool)
themCount = max(len(self.corpus) - poolCount, 1)
cacheDict = self.cache.setdefault(pname, self.dataClass(pname))
for word, totCount in self.corpus.items():
            # for every word in the corpus, check whether
            # this pool contains it
thisCount = float(pool.get(word, 0.0))
otherCount = float(totCount) - thisCount
if not poolCount:
goodMetric = 1.0
else:
goodMetric = min(1.0, otherCount/poolCount)
badMetric = min(1.0, thisCount/themCount)
f = badMetric / (goodMetric + badMetric)
# PROBABILITY_THRESHOLD
            if abs(f - 0.5) >= 0.1:
# GOOD_PROB, BAD_PROB
cacheDict[word] = max(0.0001, min(0.9999, f))
def poolProbs(self):
if self.dirty:
self.buildCache()
self.dirty = False
return self.cache
def getTokens(self, obj):
"""Hopefully it's a string and we'll just split it
on non-alphanumeric stuff.
Override this in your subclass for objects other
than text.
Alternatively, you can pass in a tokenizer as part of
instance creation.
"""
return self.split(obj)
def getProbs(self, pool, words):
""" extracts the probabilities of tokens in a message
"""
probs = [(word, pool[word]) for word in words if word in pool]
probs.sort(lambda x,y: cmp(y[1],x[1]))
return probs[:2048]
def train(self, pool, item, uid=None):
"""Train Bayes by telling him that item belongs
in pool. uid is optional and may be used to uniquely
identify the item that is being trained on.
"""
tokens = self.tokenizer(item)
pool = self.pools.setdefault(pool, self.dataClass(pool))
self._train(pool, tokens)
self.corpus.trainCount += 1
pool.trainCount += 1
if uid:
pool.training.append(uid)
self.dirty = True
def untrain(self, pool, item, uid=None):
tokens = self.tokenizer(item)
pool = self.pools.get(pool, None)
if not pool:
return
self._untrain(pool, tokens)
# I guess we want to count this as additional training?
self.corpus.trainCount += 1
pool.trainCount += 1
if uid:
pool.training.remove(uid)
self.dirty = True
def _train(self, pool, tokens):
wc = 0
for token in tokens:
count = pool.get(token, 0)
pool[token] = count + 1
count = self.corpus.get(token, 0)
self.corpus[token] = count + 1
wc += 1
pool.tokenCount += wc
self.corpus.tokenCount += wc
def _untrain(self, pool, tokens):
for token in tokens:
count = pool.get(token, 0)
if count:
if count == 1:
del(pool[token])
else:
pool[token] = count - 1
pool.tokenCount -= 1
count = self.corpus.get(token, 0)
if count:
if count == 1:
del(self.corpus[token])
else:
self.corpus[token] = count - 1
self.corpus.tokenCount -= 1
    def trainedOn(self, msg):
        # check the pools, not the cache: cache entries are rebuilt by
        # buildCache() with empty training lists
        for p in self.pools.values():
            if msg in p.training:
                return True
        return False
def guess(self, msg):
tokens = Set(self.tokenizer(msg))
pools = self.poolProbs()
res = {}
for pname, pprobs in pools.items():
p = self.getProbs(pprobs, tokens)
if len(p) != 0:
res[pname]=self.combiner(p, pname)
res = res.items()
res.sort(lambda x,y: cmp(y[1], x[1]))
return res
def robinson(self, probs, ignore):
""" computes the probability of a message being spam (Robinson's method)
P = 1 - prod(1-p)^(1/n)
Q = 1 - prod(p)^(1/n)
S = (1 + (P-Q)/(P+Q)) / 2
Courtesy of http://christophe.delord.free.fr/en/index.html
"""
nth = 1./len(probs)
P = 1.0 - reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0) ** nth
        Q = 1.0 - reduce(operator.mul, map(lambda p: p[1], probs), 1.0) ** nth
S = (P - Q) / (P + Q)
return (1 + S) / 2
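    # A worked pass through robinson() with made-up numbers: for
    # probs = [('a', 0.9), ('b', 0.8), ('c', 0.3)], n = 3 and nth = 1/3:
    #   P = 1 - (0.1 * 0.2 * 0.7) ** (1/3.) ~= 1 - 0.241 = 0.759
    #   Q = 1 - (0.9 * 0.8 * 0.3) ** (1/3.) =  1 - 0.600 = 0.400
    #   S = (0.759 - 0.400) / (0.759 + 0.400) ~= 0.310
    # and the returned value is (1 + 0.310) / 2 ~= 0.655, a mild vote in
    # favor of the pool.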
def robinsonFisher(self, probs, ignore):
""" computes the probability of a message being spam (Robinson-Fisher method)
        H = C^-1(-2 * ln(prod(p)), 2 * n)
        S = C^-1(-2 * ln(prod(1 - p)), 2 * n)
        I = (1 + H - S) / 2
Courtesy of http://christophe.delord.free.fr/en/index.html
"""
n = len(probs)
try: H = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: p[1], probs), 1.0)), 2*n)
except OverflowError: H = 0.0
try: S = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0)), 2*n)
except OverflowError: S = 0.0
return (1 + H - S) / 2
def __repr__(self):
return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]
def __len__(self):
return len(self.corpus)
def chi2P(chi, df):
""" return P(chisq >= chi, with df degree of freedom)
df must be even
"""
assert df & 1 == 0
m = chi / 2.0
sum = term = math.exp(-m)
for i in range(1, df/2):
term *= m/i
sum += term
return min(sum, 1.0)
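A minimal training/guessing session with the class above (hypothetical
Python 2 usage; pool names and sample strings are invented):

    from reverend.thomas import Bayes

    b = Bayes()                  # default tokenizer, Robinson combiner
    b.train('french', 'le chat est sur la chaise')
    b.train('english', 'the cat is on the chair')
    print b.guess('le chien est la')   # e.g. [('french', 0.9...), ...]
    b.save()                     # pickles the pools to bayesdata.dat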

plugins/Bayes.py    Normal file    (204 additions)

@@ -0,0 +1,204 @@
###
# Copyright (c) 2004, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions, and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions, and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the author of this software nor the name of
# contributors to this software may be used to endorse or promote products
# derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###
"""
Watches for paste-floods in a channel and takes appropriate measures against
violators.
"""
import supybot
__revision__ = "$Id$"
__author__ = supybot.authors.jemfinch
__contributors__ = {}
import supybot.plugins as plugins
import glob
import os.path
import reverend.thomas
from cStringIO import StringIO as sio
import supybot.conf as conf
import supybot.utils as utils
from supybot.commands import *
import supybot.ircutils as ircutils
import supybot.registry as registry
import supybot.callbacks as callbacks
def configure(advanced):
# This will be called by setup.py to configure this module. Advanced is
# a bool that specifies whether the user identified himself as an advanced
# user or not. You should effect your configuration by manipulating the
# registry as appropriate.
from supybot.questions import expect, anything, something, yn
conf.registerPlugin('Bayes', True)
Bayes = conf.registerPlugin('Bayes')
conf.registerChannelValue(Bayes, 'maximumLines',
registry.NonNegativeInteger(4, """Determines the maximum allowable number
of consecutive messages that classify as a paste. If this value is 0, no
checking will be done."""))
def tokenize(s):
return s.lower().split()
class PickleBayesDB(plugins.DbiChannelDB):
class DB(object):
def __init__(self, filename):
self.filename = filename
self.nickFilename = self.filename.replace('pickle', 'nick.pickle')
self.bayes = reverend.thomas.Bayes(tokenize)
if os.path.exists(filename):
self.bayes.load(filename)
self.nickBayes = reverend.thomas.Bayes(tokenize)
            if os.path.exists(self.nickFilename):
self.nickBayes.load(self.nickFilename)
def close(self):
self.bayes.save(self.filename)
self.nickBayes.save(self.nickFilename)
flush = close
def train(self, kind, s):
self.bayes.train(kind, s)
def trainNick(self, nick, s):
self.nickBayes.train(nick, s)
def guess(self, s):
matches = self.bayes.guess(s)
if matches:
if matches[0][1] > 0.5:
if len(matches) > 1 and \
matches[0][1] - matches[1][1] < 0.4:
return None
else:
return matches[0]
else:
self.bayes.train('normal', s)
return None
def guessNick(self, s):
L = [t for t in self.nickBayes.guess(s) if t[1] > 0.01]
if len(L) > 1:
if L[0][1] / L[1][1] > 2:
return [L[0]]
return L
BayesDB = plugins.DB('Bayes', {'pickle': PickleBayesDB})
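# Note on the calls below: plugins.DbiChannelDB (a supybot helper) keeps one
# DB instance per channel and passes the channel through as the first
# argument of each wrapped method, which appears to be why the plugin calls
# look like self.db.guess(channel, text) even though DB.guess() is defined
# as guess(self, s).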
class Bayes(callbacks.Privmsg):
def __init__(self):
self.__parent = super(Bayes, self)
self.__parent.__init__()
self.db = BayesDB()
def die(self):
self.db.close()
def doPrivmsg(self, irc, msg):
(channel, text) = msg.args
if not ircutils.isChannel(channel) or msg.guessed:
return
kind = self.db.guess(channel, text)
if kind is not None:
(kind, prob) = kind
prob *= 100
text = utils.ellipsisify(text, 30)
self.log.warning('Classified %r as %s. (%.2f%%)', text, kind, prob)
self.db.trainNick(channel, msg.nick, text)
def guess(self, irc, msg, args, channel, text):
"""[<channel>] <text>
Guesses how <text> should be classified according to the Bayesian
classifier for <channel>. <channel> is only necessary if the message
isn't sent in the channel itself, and then only if
supybot.databases.plugins.channelSpecific is True.
"""
msg.tag('guessed')
kind = self.db.guess(channel, text)
if kind is not None:
(kind, prob) = kind
prob *= 100
            irc.reply('That seems to me to be %s, '
                      'but I\'m only %.2f%% certain.' % (kind, prob))
else:
irc.reply('I don\'t know what the heck that is.')
guess = wrap(guess, ['channeldb', 'something'])
def who(self, irc, msg, args, channel, text):
"""[<channel>] <text>
Guesses who might have said <text>. <channel> is only necessary if the
message isn't sent in the channel itself, and then only if
supybot.databases.plugins.channelSpecific is True.
"""
msg.tag('guessed')
kinds = self.db.guessNick(channel, text)
if kinds:
if len(kinds) == 1:
(kind, prob) = kinds.pop()
irc.reply('It seems to me (with %.2f%% certainty) '
'that %s said that.' % (prob*100, kind))
else:
kinds = ['%s (%.2f%%)' % (k, prob*100) for (k, prob) in kinds]
irc.reply('I\'m not quite sure who said that, but it could be '
+ utils.commaAndify(kinds, And='or'))
else:
irc.reply('I have no idea who might\'ve said that.')
who = wrap(who, ['channeldb', 'something'])
def train(self, irc, msg, args, channel, language, pattern):
"""[<channel>] <language> <glob>
Trains the bot to recognize text similar to that contained in the files
matching <glob> as text of the language <language>. <channel> is only
necessary if the message isn't sent in the channel itself, and then
only if supybot.databases.plugins.channelSpecific is True.
"""
filenames = glob.glob(pattern)
if not filenames:
irc.errorInvalid('glob', pattern)
for filename in filenames:
fd = file(filename)
for line in fd:
self.db.train(channel, language, line)
fd.close()
irc.replySuccess()
train = wrap(train, ['channeldb', 'something', 'something'])
Class = Bayes
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:
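
Once the plugin is loaded, the commands defined above can be exercised from
IRC along these lines (a hypothetical session: nicks, file names, and the
probability are invented; the reply strings follow the code above):

    <user> @train english corpora/english/*.txt
    <bot> The operation succeeded.
    <user> @guess the cat is on the chair
    <bot> That seems to me to be english, but I'm only 72.41% certain.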