mirror of
https://github.com/Mikaela/Limnoria.git
synced 2025-01-23 10:34:19 +01:00
Initial checkin.
This commit is contained in:
parent
c563596405
commit
4d27ef28f7
0
others/reverend/__init__.py
Executable file
0
others/reverend/__init__.py
Executable file
96
others/reverend/splitter.py
Executable file
96
others/reverend/splitter.py
Executable file
@ -0,0 +1,96 @@
|
|||||||
|
# This module is part of the Pyndex project and is Copyright 2003 Amir
|
||||||
|
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
|
||||||
|
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
|
||||||
|
# General Public License as published by the Free Software Foundation.
|
||||||
|
|
||||||
|
import string
|
||||||
|
|
||||||
|
class Splitter(object):
    """Utility class that splits plain text into words.

    Adapted from David Mertz's article in IBM developerWorks.

    Only the characters in ``yes`` (ASCII lowercase letters and digits) are
    treated as word characters; every other character in the range 0-255 is
    turned into a separator.  Needs work to handle international characters,
    etc.
    """

    # NOTE: stopWords is not consulted by any method of this class; it is
    # kept because it is part of the class's public attributes.
    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}

    # Characters that may appear inside a word.
    yes = string.ascii_lowercase + string.digits + ''  # throw in any extras

    # Every character 0..255 (inclusive) that is not a word character.  A
    # plain loop is used because a comprehension cannot see the class-level
    # name ``yes`` on Python 3.
    nonword = ''
    for i in range(256):
        if chr(i) not in yes:
            nonword += chr(i)

    # Translation table mapping each non-word character to a space.
    # str.maketrans exists on Python 3, string.maketrans on Python 2.
    _maketrans = getattr(str, 'maketrans', None) or string.maketrans
    word_only = _maketrans(nonword, ' ' * len(nonword))
    del _maketrans

    # Character classes used by split() to spot gibberish.
    nondigits = ''.join(map(chr, list(range(0, 48)) + list(range(58, 256))))
    alpha = ''.join(map(chr, list(range(65, 91)) + list(range(97, 123))))
    ident = ''.join(map(chr, range(256)))

    def close(self):
        """Do nothing; present for Lupy support."""
        pass

    def tokenStream(self, fieldName, file, casesensitive=False):
        """Split the text/plain content of ``file`` into a list of words.

        The words are stored on ``self.tokens``, ``self.position`` is reset
        to 0 and ``self`` is returned so the caller can pull tokens with
        next().  ``fieldName`` and ``casesensitive`` are accepted but not
        used; the text is always split with split()'s default case handling.
        """
        self.tokens = self.split(file.read())
        self.position = 0
        return self

    def next(self):
        """Return the next word wrapped in a Token, or None when exhausted."""
        if self.position >= len(self.tokens):
            return None
        res = Token(self.tokens[self.position])
        self.position += 1
        return res

    def split(self, text, casesensitive=0):
        """Return the list of indexable words found in ``text``.

        Unless ``casesensitive`` is true the text is lowercased first.
        Characters outside ``yes`` become separators.  Words longer than 32
        characters, words that are (almost) all digits with more than five
        digits, words where more than half the characters are non-alphabetic,
        and words of a single character are dropped.
        """
        # Speedup trick: attributes into local scope
        word_only = self.word_only
        alpha = self.alpha
        nondigits = self.nondigits

        # Let's adjust case if not case-sensitive
        if not casesensitive:
            text = text.lower()

        # Split the raw text, stripping funny byte values
        allwords = text.translate(word_only).split()

        # Finally, let's skip some words not worth indexing
        words = []
        for word in allwords:
            if len(word) > 32:
                continue         # too long (probably gibberish)

            # Identify common patterns in non-word data (binary, UU/MIME, etc)
            num_nonalpha = len([c for c in word if c not in alpha])
            numdigits = len([c for c in word if c not in nondigits])
            if numdigits > len(word) - 2:     # almost all digits
                if numdigits > 5:             # too many digits is gibberish
                    continue                  # a moderate number is year/zipcode/etc
            elif num_nonalpha * 2 > len(word):
                continue                      # too much scattered nonalpha = gibberish

            word = word.translate(word_only)  # Let's strip funny byte values
            for subword in word.split():      # maybe embedded non-alphanumeric
                if len(subword) <= 1:
                    continue                  # too short a subword
                words.append(subword)

        return words
|
||||||
|
|
||||||
|
class Token:
    """Thin wrapper around a single term string."""

    def __init__(self, trmText):
        # The raw text is kept as-is and handed back by termText().
        self.trmText = trmText

    def termText(self):
        """Return the text this token was constructed with."""
        text = self.trmText
        return text
|
||||||
|
|
309
others/reverend/thomas.py
Executable file
309
others/reverend/thomas.py
Executable file
@ -0,0 +1,309 @@
|
|||||||
|
# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
|
||||||
|
# amir@divmod.org. This is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of version 2.1 of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation.
|
||||||
|
#
|
||||||
|
|
||||||
|
import operator
|
||||||
|
import string
|
||||||
|
import math
|
||||||
|
from sets import Set
|
||||||
|
from splitter import Splitter
|
||||||
|
|
||||||
|
class BayesData(dict):
    """Token -> count mapping for one pool, with training bookkeeping."""

    def __init__(self, name='', pool=None):
        # Counters start at zero and the list of trained items starts empty.
        self.tokenCount = 0
        self.trainCount = 0
        self.training = []
        self.pool = pool
        self.name = name

    def trainedOn(self, item):
        """Return True if ``item`` is recorded in the training list."""
        seen = self.training
        return item in seen

    def __repr__(self):
        details = (self.name, self.tokenCount)
        return '<BayesDict: %s, %s tokens>' % details
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Bayes(object):
    """Bayesian text classifier built on named pools of token counts.

    Every pool is a ``dataClass`` instance (``BayesData`` by default) mapping
    token -> count.  The special pool ``'__Corpus__'`` accumulates the counts
    of every token trained into any pool.
    """

    def __init__(self, tokenizer=None, combiner=None, dataClass=None):
        """Create an empty classifier.

        tokenizer -- callable taking an item and returning a list of strings;
                     defaults to self.getTokens.
        combiner  -- callable(probs, poolName) combining (token, probability)
                     pairs into one score; defaults to self.robinson.
        dataClass -- class used to build pools; defaults to BayesData.
        """
        if dataClass is None:
            self.dataClass = BayesData
        else:
            self.dataClass = dataClass
        self.corpus = self.dataClass('__Corpus__')
        self.pools = {}
        self.pools['__Corpus__'] = self.corpus
        self.trainCount = 0
        self.splitter = Splitter()
        # True whenever the probability cache must be rebuilt before use.
        self.dirty = True
        # The tokenizer takes an object and returns
        # a list of strings
        if tokenizer is None:
            self.tokenizer = self.getTokens
        else:
            self.tokenizer = tokenizer
        # The combiner combines probabilities
        if combiner is None:
            self.combiner = self.robinson
        else:
            self.combiner = combiner

    def split(self, text):
        """Split ``text`` into words using the built-in Splitter."""
        return self.splitter.split(text)

    def commit(self):
        """Persist the pools by calling save() with its default filename."""
        self.save()

    def newPool(self, poolName):
        """Create a new pool, without actually doing any
        training.

        Returns the pool named ``poolName``; an existing pool is returned
        unchanged.
        """
        self.dirty = True  # not always true, but it's simple
        if poolName not in self.pools:
            self.pools[poolName] = self.dataClass(poolName)
        return self.pools[poolName]

    def removePool(self, poolName):
        """Delete the pool named ``poolName``; KeyError if it is unknown."""
        del(self.pools[poolName])
        self.dirty = True

    def renamePool(self, poolName, newName):
        """Rename a pool; KeyError if ``poolName`` is unknown.

        Renaming a pool to its current name is a no-op (previously this
        deleted the pool).
        """
        pool = self.pools[poolName]
        if newName != poolName:
            self.pools[newName] = pool
            pool.name = newName
            self.removePool(poolName)
        self.dirty = True

    def mergePools(self, destPool, sourcePool):
        """Merge an existing pool into another.
        The data from sourcePool is merged into destPool.
        The arguments are the names of the pools to be merged.
        The pool named sourcePool is left intact and you may
        want to call removePool() to get rid of it.
        """
        sp = self.pools[sourcePool]
        dp = self.pools[destPool]
        for tok, count in sp.items():
            if dp.get(tok):
                dp[tok] += count
            else:
                dp[tok] = count
                # NOTE(review): the else-branch nesting is reconstructed;
                # only tokens new to the destination bump its tokenCount.
                dp.tokenCount += 1
        self.dirty = True

    def poolData(self, poolName):
        """Return a list of the (token, count) tuples.
        """
        return list(self.pools[poolName].items())

    def poolTokens(self, poolName):
        """Return a list of the tokens in this pool.
        """
        return [tok for tok, count in self.poolData(poolName)]

    def save(self, fname='bayesdata.dat'):
        """Pickle self.pools into the file named ``fname``."""
        try:
            from cPickle import dump
        except ImportError:
            from pickle import dump
        fp = open(fname, 'wb')
        try:
            dump(self.pools, fp)
        finally:
            # Close the handle even if pickling fails.
            fp.close()

    def load(self, fname='bayesdata.dat'):
        """Replace self.pools with the pools pickled in ``fname``.

        NOTE: unpickling can execute arbitrary code contained in the file;
        only load files from a trusted source.
        """
        try:
            from cPickle import load
        except ImportError:
            from pickle import load
        fp = open(fname, 'rb')
        try:
            self.pools = load(fp)
        finally:
            fp.close()
        self.corpus = self.pools['__Corpus__']
        self.dirty = True

    def poolNames(self):
        """Return a sorted list of Pool names.
        Does not include the system pool '__Corpus__'.
        """
        return sorted([name for name in self.pools if name != '__Corpus__'])

    def buildCache(self):
        """ merges corpora and computes probabilities

        For every pool except '__Corpus__', each corpus word gets a score
        f = bad / (good + bad); it is cached (clamped to [0.0001, 0.9999])
        only when it differs from 0.5 by at least 0.1.
        """
        self.cache = {}
        for pname, pool in self.pools.items():
            # skip our special pool
            if pname == '__Corpus__':
                continue

            poolCount = len(pool)
            themCount = max(len(self.corpus) - poolCount, 1)
            cacheDict = self.cache.setdefault(pname, self.dataClass(pname))

            for word, totCount in self.corpus.items():
                # for every word in the corpus
                # check to see if this pool contains this word
                thisCount = float(pool.get(word, 0.0))
                otherCount = float(totCount) - thisCount

                if not poolCount:
                    goodMetric = 1.0
                else:
                    goodMetric = min(1.0, otherCount / poolCount)
                badMetric = min(1.0, thisCount / themCount)
                f = badMetric / (goodMetric + badMetric)

                # PROBABILITY_THRESHOLD
                if abs(f - 0.5) >= 0.1:
                    # GOOD_PROB, BAD_PROB
                    cacheDict[word] = max(0.0001, min(0.9999, f))

    def poolProbs(self):
        """Return the per-pool probability cache, rebuilding it if dirty."""
        if self.dirty:
            self.buildCache()
            self.dirty = False
        return self.cache

    def getTokens(self, obj):
        """Hopefully it's a string and we'll just split it
        on non-alphanumeric stuff.

        Override this in your subclass for objects other
        than text.

        Alternatively, you can pass in a tokenizer as part of
        instance creation.
        """
        return self.split(obj)

    def getProbs(self, pool, words):
        """ extracts the probabilities of tokens in a message

        Returns at most 2048 (word, probability) pairs for the words present
        in ``pool``, highest probability first.
        """
        probs = [(word, pool[word]) for word in words if word in pool]
        # A reversed key sort stays stable, like the cmp-based sort it replaces.
        probs.sort(key=lambda pair: pair[1], reverse=True)
        return probs[:2048]

    def train(self, pool, item, uid=None):
        """Train Bayes by telling him that item belongs
        in pool. uid is optional and may be used to uniquely
        identify the item that is being trained on.
        """
        tokens = self.tokenizer(item)
        pool = self.pools.setdefault(pool, self.dataClass(pool))
        self._train(pool, tokens)
        self.corpus.trainCount += 1
        pool.trainCount += 1
        if uid:
            pool.training.append(uid)
        self.dirty = True

    def untrain(self, pool, item, uid=None):
        """Remove the tokens of ``item`` from the pool named ``pool``.

        Does nothing when the pool is missing or empty.  If ``uid`` is given
        it is removed from the pool's training list (ValueError if absent).
        """
        tokens = self.tokenizer(item)
        pool = self.pools.get(pool, None)
        if not pool:
            return
        self._untrain(pool, tokens)
        # I guess we want to count this as additional training?
        self.corpus.trainCount += 1
        pool.trainCount += 1
        if uid:
            pool.training.remove(uid)
        self.dirty = True

    def _train(self, pool, tokens):
        """Add one count per token to both ``pool`` and the corpus."""
        wc = 0
        for token in tokens:
            count = pool.get(token, 0)
            pool[token] = count + 1
            count = self.corpus.get(token, 0)
            self.corpus[token] = count + 1
            wc += 1
        pool.tokenCount += wc
        self.corpus.tokenCount += wc

    def _untrain(self, pool, tokens):
        """Remove one count per token from ``pool`` and from the corpus.

        Entries whose count would reach zero are deleted; tokens that are
        not present are skipped.
        """
        for token in tokens:
            count = pool.get(token, 0)
            if count:
                if count == 1:
                    del(pool[token])
                else:
                    pool[token] = count - 1
                pool.tokenCount -= 1

            count = self.corpus.get(token, 0)
            if count:
                if count == 1:
                    del(self.corpus[token])
                else:
                    self.corpus[token] = count - 1
                self.corpus.tokenCount -= 1

    def trainedOn(self, msg):
        """Return True if ``msg`` is in the training list of any pool.

        The training lists live on the pools themselves; the probability
        cache never receives them and may not even have been built yet.
        """
        for p in self.pools.values():
            if msg in p.training:
                return True
        return False

    def guess(self, msg):
        """Classify ``msg``.

        Returns a list of (poolName, score) pairs sorted by descending score.
        Pools that know none of the message's tokens are omitted.
        """
        tokens = set(self.tokenizer(msg))
        pools = self.poolProbs()

        res = {}
        for pname, pprobs in pools.items():
            p = self.getProbs(pprobs, tokens)
            if len(p) != 0:
                res[pname] = self.combiner(p, pname)
        return sorted(res.items(), key=lambda pair: pair[1], reverse=True)

    def robinson(self, probs, ignore):
        """ computes the probability of a message being spam (Robinson's method)
            P = 1 - prod(1-p)^(1/n)
            Q = 1 - prod(p)^(1/n)
            S = (1 + (P-Q)/(P+Q)) / 2
            Courtesy of http://christophe.delord.free.fr/en/index.html

        ``probs`` is a non-empty list of (token, probability) pairs; the
        second argument is unused.
        """
        nth = 1. / len(probs)
        prodNot = 1.0   # running product of (1 - p)
        prodIs = 1.0    # running product of p
        for pair in probs:
            prodNot *= 1.0 - pair[1]
            prodIs *= pair[1]
        P = 1.0 - prodNot ** nth
        Q = 1.0 - prodIs ** nth
        S = (P - Q) / (P + Q)
        return (1 + S) / 2

    def robinsonFisher(self, probs, ignore):
        """ computes the probability of a message being spam (Robinson-Fisher method)
            H = C-1( -2.ln(prod(p)), 2*n )
            S = C-1( -2.ln(prod(1-p)), 2*n )
            I = (1 + H - S) / 2
            Courtesy of http://christophe.delord.free.fr/en/index.html

        The logarithm of each product is computed as a sum of logarithms so
        that long messages cannot underflow the product to 0.0, which would
        make math.log raise ValueError.
        """
        n = len(probs)
        try:
            H = chi2P(-2.0 * sum([math.log(p[1]) for p in probs]), 2 * n)
        except (OverflowError, ValueError):
            H = 0.0
        try:
            S = chi2P(-2.0 * sum([math.log(1.0 - p[1]) for p in probs]), 2 * n)
        except (OverflowError, ValueError):
            S = 0.0
        return (1 + H - S) / 2

    def __repr__(self):
        return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]

    def __len__(self):
        """Return the number of distinct tokens in the corpus pool."""
        return len(self.corpus)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def chi2P(chi, df):
    """ return P(chisq >= chi, with df degree of freedom)

    df must be even (and an integer); the result is capped at 1.0.
    """
    assert df & 1 == 0
    m = chi / 2.0
    # The series starts at exp(-m); each further term is term * m / i.
    total = term = math.exp(-m)
    # Floor division keeps range() valid under true division.
    for i in range(1, df // 2):
        term *= m / i
        total += term
    return min(total, 1.0)
|
||||||
|
|
204
plugins/Bayes.py
Normal file
204
plugins/Bayes.py
Normal file
@ -0,0 +1,204 @@
|
|||||||
|
###
|
||||||
|
# Copyright (c) 2004, Jeremiah Fincher
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# * Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions, and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
# this list of conditions, and the following disclaimer in the
|
||||||
|
# documentation and/or other materials provided with the distribution.
|
||||||
|
# * Neither the name of the author of this software nor the name of
|
||||||
|
# contributors to this software may be used to endorse or promote products
|
||||||
|
# derived from this software without specific prior written consent.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
###
|
||||||
|
|
||||||
|
"""
|
||||||
|
Classifies channel messages with a per-channel Bayesian classifier and guesses
which nick most likely said a given piece of text.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import supybot
|
||||||
|
|
||||||
|
__revision__ = "$Id$"
|
||||||
|
__author__ = supybot.authors.jemfinch
|
||||||
|
__contributors__ = {}
|
||||||
|
|
||||||
|
import supybot.plugins as plugins
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import os.path
|
||||||
|
import reverend.thomas
|
||||||
|
from cStringIO import StringIO as sio
|
||||||
|
|
||||||
|
import supybot.conf as conf
|
||||||
|
import supybot.utils as utils
|
||||||
|
from supybot.commands import *
|
||||||
|
import supybot.ircutils as ircutils
|
||||||
|
import supybot.registry as registry
|
||||||
|
import supybot.callbacks as callbacks
|
||||||
|
|
||||||
|
|
||||||
|
def configure(advanced):
    """Register the Bayes plugin; called by setup.py to configure this module.

    ``advanced`` is a bool that specifies whether the user identified himself
    as an advanced user or not.  Configuration is effected by manipulating
    the registry as appropriate.
    """
    from supybot.questions import (expect,
                                   anything,
                                   something,
                                   yn)
    conf.registerPlugin('Bayes', True)
|
||||||
|
|
||||||
|
# Configuration group for this plugin.  The name ``Bayes`` is rebound to the
# plugin class later in the module.
Bayes = conf.registerPlugin('Bayes')
# NOTE(review): maximumLines is registered here but is not read anywhere else
# in the visible source.
conf.registerChannelValue(
    Bayes,
    'maximumLines',
    registry.NonNegativeInteger(4, """Determines the maximum allowable number
    of consecutive messages that classify as a paste.  If this value is 0, no
    checking will be done."""))
|
||||||
|
|
||||||
|
def tokenize(s):
    """Return the whitespace-separated words of ``s``, lowercased."""
    lowered = s.lower()
    return lowered.split()
|
||||||
|
|
||||||
|
class PickleBayesDB(plugins.DbiChannelDB):
    """Channel database backed by pickled reverend.thomas.Bayes classifiers."""
    class DB(object):
        """Two classifiers for one database file: one keyed by kind of text
        (``bayes``) and one keyed by nick (``nickBayes``)."""
        def __init__(self, filename):
            """Create both classifiers, loading saved state when it exists."""
            self.filename = filename
            # NOTE(review): if 'pickle' does not occur in filename, both
            # classifiers end up sharing one file -- confirm the naming
            # scheme used by plugins.DB.
            self.nickFilename = self.filename.replace('pickle', 'nick.pickle')
            self.bayes = reverend.thomas.Bayes(tokenize)
            if os.path.exists(filename):
                self.bayes.load(filename)
            self.nickBayes = reverend.thomas.Bayes(tokenize)
            # Must be the instance attribute; the bare name nickFilename is
            # undefined here and raised NameError.
            if os.path.exists(self.nickFilename):
                self.nickBayes.load(self.nickFilename)

        def close(self):
            """Save both classifiers to their files."""
            self.bayes.save(self.filename)
            self.nickBayes.save(self.nickFilename)
        flush = close

        def train(self, kind, s):
            """Train the text classifier that ``s`` belongs to ``kind``."""
            self.bayes.train(kind, s)

        def trainNick(self, nick, s):
            """Train the nick classifier that ``s`` was said by ``nick``."""
            self.nickBayes.train(nick, s)

        def guess(self, s):
            """Return the best (kind, probability) match for ``s``, or None.

            A match is returned only when its score exceeds 0.5 and it leads
            the runner-up (if any) by at least 0.4.  When the classifier has
            no match at all, ``s`` is trained as 'normal'.
            """
            matches = self.bayes.guess(s)
            if matches:
                if matches[0][1] > 0.5:
                    if len(matches) > 1 and \
                       matches[0][1] - matches[1][1] < 0.4:
                        return None
                    else:
                        return matches[0]
            else:
                # NOTE(review): the source's indentation was lost; this else
                # is assumed to pair with ``if matches`` -- confirm upstream.
                self.bayes.train('normal', s)
                return None

        def guessNick(self, s):
            """Return the (nick, probability) candidates for who said ``s``.

            Candidates scoring 0.01 or less are dropped; if the best one is
            more than twice as likely as the second, only it is returned.
            """
            L = [t for t in self.nickBayes.guess(s) if t[1] > 0.01]
            if len(L) > 1:
                if L[0][1] / L[1][1] > 2:
                    return [L[0]]
            return L
|
||||||
|
|
||||||
|
BayesDB = plugins.DB('Bayes', {'pickle': PickleBayesDB})
|
||||||
|
|
||||||
|
class Bayes(callbacks.Privmsg):
    # Plugin that classifies channel messages and learns which nick says what.
    # (Comments rather than a class docstring, so no new runtime-visible help
    # text is introduced.)
    def __init__(self):
        self.__parent = super(Bayes, self)
        self.__parent.__init__()
        self.db = BayesDB()

    def die(self):
        # Persist the classifiers when the plugin is unloaded.
        self.db.close()

    def doPrivmsg(self, irc, msg):
        (channel, text) = msg.args
        # Ignore non-channel messages and messages tagged by guess/who.
        if not ircutils.isChannel(channel) or msg.guessed:
            return
        kind = self.db.guess(channel, text)
        if kind is not None:
            (kind, prob) = kind
            prob *= 100
            # Shorten only the logged copy; the nick classifier below must be
            # trained on the complete message, not the ellipsified one.
            shortened = utils.ellipsisify(text, 30)
            self.log.warning('Classified %r as %s. (%.2f%%)',
                             shortened, kind, prob)
        # NOTE(review): the source's indentation was lost; training on every
        # channel message (not only classified ones) is assumed -- confirm.
        self.db.trainNick(channel, msg.nick, text)

    def guess(self, irc, msg, args, channel, text):
        """[<channel>] <text>

        Guesses how <text> should be classified according to the Bayesian
        classifier for <channel>.  <channel> is only necessary if the message
        isn't sent in the channel itself, and then only if
        supybot.databases.plugins.channelSpecific is True.
        """
        # Tag the message so doPrivmsg does not process this command itself.
        msg.tag('guessed')
        kind = self.db.guess(channel, text)
        if kind is not None:
            (kind, prob) = kind
            prob *= 100
            # prob is a percentage here, so print the percent sign like the
            # who command and the log message do.
            irc.reply('That seems to me to be %s, '
                      'but I\'m only %.2f%% certain.' % (kind, prob))
        else:
            irc.reply('I don\'t know what the heck that is.')
    guess = wrap(guess, ['channeldb', 'something'])

    def who(self, irc, msg, args, channel, text):
        """[<channel>] <text>

        Guesses who might have said <text>.  <channel> is only necessary if the
        message isn't sent in the channel itself, and then only if
        supybot.databases.plugins.channelSpecific is True.
        """
        msg.tag('guessed')
        kinds = self.db.guessNick(channel, text)
        if kinds:
            if len(kinds) == 1:
                (kind, prob) = kinds.pop()
                irc.reply('It seems to me (with %.2f%% certainty) '
                          'that %s said that.' % (prob*100, kind))
            else:
                kinds = ['%s (%.2f%%)' % (k, prob*100) for (k, prob) in kinds]
                irc.reply('I\'m not quite sure who said that, but it could be '
                          + utils.commaAndify(kinds, And='or'))
        else:
            irc.reply('I have no idea who might\'ve said that.')
    who = wrap(who, ['channeldb', 'something'])

    def train(self, irc, msg, args, channel, language, pattern):
        """[<channel>] <language> <glob>


        Trains the bot to recognize text similar to that contained in the files
        matching <glob> as text of the language <language>.  <channel> is only
        necessary if the message isn't sent in the channel itself, and then
        only if supybot.databases.plugins.channelSpecific is True.
        """
        # NOTE(review): this reads whatever local files match a glob supplied
        # by the caller, and the wrap() spec below requires no capability --
        # consider restricting who may run this command.
        filenames = glob.glob(pattern)
        if not filenames:
            irc.errorInvalid('glob', pattern)
        for filename in filenames:
            fd = open(filename)
            try:
                for line in fd:
                    self.db.train(channel, language, line)
            finally:
                # Close the file even if training raises.
                fd.close()
        irc.replySuccess()
    train = wrap(train, ['channeldb', 'something', 'something'])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Class = Bayes
|
||||||
|
|
||||||
|
# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:
|
Loading…
Reference in New Issue
Block a user