Mirror of https://github.com/Mikaela/Limnoria.git, synced 2025-01-23 10:34:19 +01:00
Initial checkin.
This commit is contained in:
parent c563596405
commit 4d27ef28f7
0	others/reverend/__init__.py	Executable file
96	others/reverend/splitter.py	Executable file
@@ -0,0 +1,96 @@
# This module is part of the Pyndex project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.

import string

class Splitter(object):
    """Split plain text into words; utility class.

    Adapted from David Mertz's article in IBM developerWorks.

    Needs work to handle international characters, etc."""

##    __slots__ = ['stemmer', 'porter', 'stopwording', 'word_only', 'nonword',
##                 'nondigits', 'alpha', 'ident', 'tokens', 'position']

    stopWords = {'and': 1, 'be': 1, 'to': 1, 'that': 1, 'into': 1,
                 'it': 1, 'but': 1, 'as': 1, 'are': 1, 'they': 1,
                 'in': 1, 'not': 1, 'such': 1, 'with': 1, 'by': 1,
                 'is': 1, 'if': 1, 'a': 1, 'on': 1, 'for': 1,
                 'no': 1, 'these': 1, 'of': 1, 'there': 1,
                 'this': 1, 'will': 1, 'their': 1, 's': 1, 't': 1,
                 'then': 1, 'the': 1, 'was': 1, 'or': 1, 'at': 1}

    yes = string.lowercase + string.digits + '' # throw in any extras
    nonword = ''
    for i in range(0, 255):
        if chr(i) not in yes:
            nonword += chr(i)

    word_only = string.maketrans(nonword, " " * len(nonword))

    nondigits = string.join(map(chr, range(0, 48)) + map(chr, range(58, 255)), '')
    alpha = string.join(map(chr, range(65, 91)) + map(chr, range(97, 123)), '')
    ident = string.join(map(chr, range(256)), '')

    def close(self):
        # Lupy support
        pass

    def tokenStream(self, fieldName, file, casesensitive=False):
        """Split a text/plain string into a list of words."""
        self.tokens = self.split(file.read())
        self.position = 0
        return self

    def next(self):
        if self.position >= len(self.tokens):
            return None
        res = Token(self.tokens[self.position])
        self.position += 1
        return res

    def split(self, text, casesensitive=0):
        # Speedup trick: attributes into local scope
        word_only = self.word_only
        ident = self.ident
        alpha = self.alpha
        nondigits = self.nondigits

        # Let's adjust case if not case-sensitive
        if not casesensitive:
            text = string.lower(text)

        # Split the raw text
        allwords = text.translate(word_only).split() # Let's strip funny byte values

        # Finally, let's skip some words not worth indexing
        words = []
        for word in allwords:
            if len(word) > 32:
                continue # too long (probably gibberish)

            # Identify common patterns in non-word data (binary, UU/MIME, etc.)
            num_nonalpha = len(word.translate(ident, alpha))
            numdigits = len(word.translate(ident, nondigits))
            if numdigits > len(word) - 2: # almost all digits
                if numdigits > 5: # too many digits is gibberish
                    continue # a moderate number is year/zipcode/etc.
            elif num_nonalpha * 2 > len(word): # too much scattered nonalpha = gibberish
                continue

            word = word.translate(word_only) # Let's strip funny byte values
            subwords = word.split() # maybe embedded non-alphanumeric
            for subword in subwords: # ...so we might have subwords
                if len(subword) <= 1:
                    continue # too short a subword
                words.append(subword)

        return words


class Token:
    def __init__(self, trmText):
        self.trmText = trmText

    def termText(self):
        return self.trmText
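
For orientation, a minimal usage sketch of the Splitter above (Python 2, like
the checked-in code; the sample string and the result shown in the comment are
invented for illustration):

# Usage sketch for reverend.splitter.Splitter (Python 2).
from reverend.splitter import Splitter

s = Splitter()
# Case is folded, non-alphanumeric bytes become separators, and one-letter
# subwords are dropped, so this yields something like:
#   ['the', 'year', '2004', 'supybot', 'gets', 'bayesian', 'classifier']
print s.split('The year 2004: Supybot gets a *Bayesian* classifier!')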
309	others/reverend/thomas.py	Executable file
@@ -0,0 +1,309 @@
# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
# amir@divmod.org. This is free software; you can redistribute it and/or
# modify it under the terms of version 2.1 of the GNU Lesser General Public
# License as published by the Free Software Foundation.
#

import operator
import string
import math
from sets import Set
from splitter import Splitter

class BayesData(dict):

    def __init__(self, name='', pool=None):
        self.name = name
        self.training = []
        self.pool = pool
        self.tokenCount = 0
        self.trainCount = 0

    def trainedOn(self, item):
        return item in self.training

    def __repr__(self):
        return '<BayesDict: %s, %s tokens>' % (self.name, self.tokenCount)


class Bayes(object):

    def __init__(self, tokenizer=None, combiner=None, dataClass=None):
        if dataClass is None:
            self.dataClass = BayesData
        else:
            self.dataClass = dataClass
        self.corpus = self.dataClass('__Corpus__')
        self.pools = {}
        self.pools['__Corpus__'] = self.corpus
        self.trainCount = 0
        self.splitter = Splitter()
        self.dirty = True
        # The tokenizer takes an object and returns
        # a list of strings
        if tokenizer is None:
            self.tokenizer = self.getTokens
        else:
            self.tokenizer = tokenizer
        # The combiner combines probabilities
        if combiner is None:
            self.combiner = self.robinson
        else:
            self.combiner = combiner

    def split(self, text):
        return self.splitter.split(text)

    def commit(self):
        self.save()

    def newPool(self, poolName):
        """Create a new pool, without actually doing any training."""
        self.dirty = True # not always true, but it's simple
        return self.pools.setdefault(poolName, self.dataClass(poolName))

    def removePool(self, poolName):
        del(self.pools[poolName])
        self.dirty = True

    def renamePool(self, poolName, newName):
        self.pools[newName] = self.pools[poolName]
        self.pools[newName].name = newName
        self.removePool(poolName)
        self.dirty = True

    def mergePools(self, destPool, sourcePool):
        """Merge an existing pool into another.

        The data from sourcePool is merged into destPool.
        The arguments are the names of the pools to be merged.
        The pool named sourcePool is left intact, and you may
        want to call removePool() to get rid of it.
        """
        sp = self.pools[sourcePool]
        dp = self.pools[destPool]
        for tok, count in sp.items():
            if dp.get(tok):
                dp[tok] += count
            else:
                dp[tok] = count
                dp.tokenCount += 1
        self.dirty = True

    def poolData(self, poolName):
        """Return a list of the (token, count) tuples."""
        return self.pools[poolName].items()

    def poolTokens(self, poolName):
        """Return a list of the tokens in this pool."""
        return [tok for tok, count in self.poolData(poolName)]

    def save(self, fname='bayesdata.dat'):
        from cPickle import dump
        fp = open(fname, 'wb')
        dump(self.pools, fp)
        fp.close()

    def load(self, fname='bayesdata.dat'):
        from cPickle import load
        fp = open(fname, 'rb')
        self.pools = load(fp)
        fp.close()
        self.corpus = self.pools['__Corpus__']
        self.dirty = True

    def poolNames(self):
        """Return a sorted list of pool names.

        Does not include the system pool '__Corpus__'.
        """
        pools = self.pools.keys()
        pools.remove('__Corpus__')
        pools.sort()
        return pools

    def buildCache(self):
        """Merge corpora and compute probabilities."""
        self.cache = {}
        for pname, pool in self.pools.items():
            # skip our special pool
            if pname == '__Corpus__':
                continue

            poolCount = len(pool)
            themCount = max(len(self.corpus) - poolCount, 1)
            cacheDict = self.cache.setdefault(pname, self.dataClass(pname))

            for word, totCount in self.corpus.items():
                # for every word in the corpus,
                # check to see if this pool contains this word
                thisCount = float(pool.get(word, 0.0))
                otherCount = float(totCount) - thisCount

                if not poolCount:
                    goodMetric = 1.0
                else:
                    goodMetric = min(1.0, otherCount/poolCount)
                badMetric = min(1.0, thisCount/themCount)
                f = badMetric / (goodMetric + badMetric)

                # PROBABILITY_THRESHOLD
                if abs(f - 0.5) >= 0.1:
                    # GOOD_PROB, BAD_PROB
                    cacheDict[word] = max(0.0001, min(0.9999, f))

    def poolProbs(self):
        if self.dirty:
            self.buildCache()
            self.dirty = False
        return self.cache

    def getTokens(self, obj):
        """Hopefully it's a string and we'll just split it
        on non-alphanumeric stuff.

        Override this in your subclass for objects other
        than text.

        Alternatively, you can pass in a tokenizer as part of
        instance creation.
        """
        return self.split(obj)

    def getProbs(self, pool, words):
        """Extract the probabilities of tokens in a message."""
        probs = [(word, pool[word]) for word in words if word in pool]
        probs.sort(lambda x, y: cmp(y[1], x[1]))
        return probs[:2048]

    def train(self, pool, item, uid=None):
        """Train Bayes by telling him that item belongs
        in pool. uid is optional and may be used to uniquely
        identify the item that is being trained on.
        """
        tokens = self.tokenizer(item)
        pool = self.pools.setdefault(pool, self.dataClass(pool))
        self._train(pool, tokens)
        self.corpus.trainCount += 1
        pool.trainCount += 1
        if uid:
            pool.training.append(uid)
        self.dirty = True

    def untrain(self, pool, item, uid=None):
        tokens = self.tokenizer(item)
        pool = self.pools.get(pool, None)
        if not pool:
            return
        self._untrain(pool, tokens)
        # I guess we want to count this as additional training?
        self.corpus.trainCount += 1
        pool.trainCount += 1
        if uid:
            pool.training.remove(uid)
        self.dirty = True

    def _train(self, pool, tokens):
        wc = 0
        for token in tokens:
            count = pool.get(token, 0)
            pool[token] = count + 1
            count = self.corpus.get(token, 0)
            self.corpus[token] = count + 1
            wc += 1
        pool.tokenCount += wc
        self.corpus.tokenCount += wc

    def _untrain(self, pool, tokens):
        for token in tokens:
            count = pool.get(token, 0)
            if count:
                if count == 1:
                    del(pool[token])
                else:
                    pool[token] = count - 1
                pool.tokenCount -= 1

            count = self.corpus.get(token, 0)
            if count:
                if count == 1:
                    del(self.corpus[token])
                else:
                    self.corpus[token] = count - 1
                self.corpus.tokenCount -= 1

    def trainedOn(self, msg):
        for p in self.cache.values():
            if msg in p.training:
                return True
        return False

    def guess(self, msg):
        tokens = Set(self.tokenizer(msg))
        pools = self.poolProbs()

        res = {}
        for pname, pprobs in pools.items():
            p = self.getProbs(pprobs, tokens)
            if len(p) != 0:
                res[pname] = self.combiner(p, pname)
        res = res.items()
        res.sort(lambda x, y: cmp(y[1], x[1]))
        return res

    def robinson(self, probs, ignore):
        """Compute the probability of a message being spam (Robinson's method).

            P = 1 - prod(1 - p)^(1/n)
            Q = 1 - prod(p)^(1/n)
            S = (1 + (P - Q)/(P + Q)) / 2

        Courtesy of http://christophe.delord.free.fr/en/index.html
        """
        nth = 1./len(probs)
        P = 1.0 - reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0) ** nth
        Q = 1.0 - reduce(operator.mul, map(lambda p: p[1], probs)) ** nth
        S = (P - Q) / (P + Q)
        return (1 + S) / 2

    def robinsonFisher(self, probs, ignore):
        """Compute the probability of a message being spam (Robinson-Fisher method).

            H = C^-1(-2 * ln(prod(p)), 2*n)
            S = C^-1(-2 * ln(prod(1 - p)), 2*n)
            I = (1 + H - S) / 2

        Courtesy of http://christophe.delord.free.fr/en/index.html
        """
        n = len(probs)
        try:
            H = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: p[1], probs), 1.0)), 2*n)
        except OverflowError:
            H = 0.0
        try:
            S = chi2P(-2.0 * math.log(reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0)), 2*n)
        except OverflowError:
            S = 0.0
        return (1 + H - S) / 2

    def __repr__(self):
        return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]

    def __len__(self):
        return len(self.corpus)


def chi2P(chi, df):
    """Return P(chisq >= chi), with df degrees of freedom.

    df must be even.
    """
    assert df & 1 == 0
    m = chi / 2.0
    sum = term = math.exp(-m)
    for i in range(1, df/2):
        term *= m/i
        sum += term
    return min(sum, 1.0)
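
Again for orientation, a hypothetical training/guessing session with the Bayes
class above (Python 2; the pool names, sample phrases, and the score in the
comment are invented):

# Usage sketch for reverend.thomas.Bayes (Python 2).
from reverend.thomas import Bayes

guesser = Bayes()   # default tokenizer is Splitter.split
guesser.train('fr', 'le soleil brille dans le ciel bleu')
guesser.train('en', 'the sun is shining in the blue sky')

# guess() returns (poolName, score) pairs, best match first; the per-token
# probabilities are combined with robinson() by default, i.e.
#   S = (1 + (P - Q)/(P + Q)) / 2, with P and Q as in the docstring above.
print guesser.guess('the sky is blue')   # e.g. [('en', 0.98)]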
204	plugins/Bayes.py	Normal file
@@ -0,0 +1,204 @@
###
# Copyright (c) 2004, Jeremiah Fincher
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions, and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions, and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of the author of this software nor the name of
#     contributors to this software may be used to endorse or promote products
#     derived from this software without specific prior written consent.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###

"""
Classifies channel messages with a Bayesian classifier: watches for
paste-floods, and can guess the language of a message or who might have
said it.
"""

import supybot

__revision__ = "$Id$"
__author__ = supybot.authors.jemfinch
__contributors__ = {}

import supybot.plugins as plugins

import glob
import os.path

import reverend.thomas
from cStringIO import StringIO as sio

import supybot.conf as conf
import supybot.utils as utils
from supybot.commands import *
import supybot.ircutils as ircutils
import supybot.registry as registry
import supybot.callbacks as callbacks


def configure(advanced):
    # This will be called by setup.py to configure this module.  Advanced is
    # a bool that specifies whether the user identified himself as an advanced
    # user or not.  You should effect your configuration by manipulating the
    # registry as appropriate.
    from supybot.questions import expect, anything, something, yn
    conf.registerPlugin('Bayes', True)

Bayes = conf.registerPlugin('Bayes')
conf.registerChannelValue(Bayes, 'maximumLines',
    registry.NonNegativeInteger(4, """Determines the maximum allowable number
    of consecutive messages that classify as a paste.  If this value is 0, no
    checking will be done."""))

def tokenize(s):
    return s.lower().split()

class PickleBayesDB(plugins.DbiChannelDB):
    class DB(object):
        def __init__(self, filename):
            self.filename = filename
            self.nickFilename = self.filename.replace('pickle', 'nick.pickle')
            self.bayes = reverend.thomas.Bayes(tokenize)
            if os.path.exists(filename):
                self.bayes.load(filename)
            self.nickBayes = reverend.thomas.Bayes(tokenize)
            if os.path.exists(self.nickFilename):
                self.nickBayes.load(self.nickFilename)

        def close(self):
            self.bayes.save(self.filename)
            self.nickBayes.save(self.nickFilename)
        flush = close

        def train(self, kind, s):
            self.bayes.train(kind, s)

        def trainNick(self, nick, s):
            self.nickBayes.train(nick, s)

        def guess(self, s):
            matches = self.bayes.guess(s)
            if matches:
                if matches[0][1] > 0.5:
                    if len(matches) > 1 and \
                       matches[0][1] - matches[1][1] < 0.4:
                        return None
                    else:
                        return matches[0]
                else:
                    self.bayes.train('normal', s)
            return None

        def guessNick(self, s):
            L = [t for t in self.nickBayes.guess(s) if t[1] > 0.01]
            if len(L) > 1:
                if L[0][1] / L[1][1] > 2:
                    return [L[0]]
            return L

BayesDB = plugins.DB('Bayes', {'pickle': PickleBayesDB})

class Bayes(callbacks.Privmsg):
    def __init__(self):
        self.__parent = super(Bayes, self)
        self.__parent.__init__()
        self.db = BayesDB()

    def die(self):
        self.db.close()

    def doPrivmsg(self, irc, msg):
        (channel, text) = msg.args
        if not ircutils.isChannel(channel) or msg.guessed:
            return
        kind = self.db.guess(channel, text)
        if kind is not None:
            (kind, prob) = kind
            prob *= 100
            text = utils.ellipsisify(text, 30)
            self.log.warning('Classified %r as %s. (%.2f%%)', text, kind, prob)
        self.db.trainNick(channel, msg.nick, text)

    def guess(self, irc, msg, args, channel, text):
        """[<channel>] <text>

        Guesses how <text> should be classified according to the Bayesian
        classifier for <channel>.  <channel> is only necessary if the message
        isn't sent in the channel itself, and then only if
        supybot.databases.plugins.channelSpecific is True.
        """
        msg.tag('guessed')
        kind = self.db.guess(channel, text)
        if kind is not None:
            (kind, prob) = kind
            prob *= 100
            irc.reply('That seems to me to be %s, '
                      'but I\'m only %.2f%% certain.' % (kind, prob))
        else:
            irc.reply('I don\'t know what the heck that is.')
    guess = wrap(guess, ['channeldb', 'something'])

    def who(self, irc, msg, args, channel, text):
        """[<channel>] <text>

        Guesses who might have said <text>.  <channel> is only necessary if the
        message isn't sent in the channel itself, and then only if
        supybot.databases.plugins.channelSpecific is True.
        """
        msg.tag('guessed')
        kinds = self.db.guessNick(channel, text)
        if kinds:
            if len(kinds) == 1:
                (kind, prob) = kinds.pop()
                irc.reply('It seems to me (with %.2f%% certainty) '
                          'that %s said that.' % (prob*100, kind))
            else:
                kinds = ['%s (%.2f%%)' % (k, prob*100) for (k, prob) in kinds]
                irc.reply('I\'m not quite sure who said that, but it could be '
                          + utils.commaAndify(kinds, And='or'))
        else:
            irc.reply('I have no idea who might\'ve said that.')
    who = wrap(who, ['channeldb', 'something'])

    def train(self, irc, msg, args, channel, language, pattern):
        """[<channel>] <language> <glob>

        Trains the bot to recognize text similar to that contained in the
        files matching <glob> as text of the language <language>.  <channel>
        is only necessary if the message isn't sent in the channel itself, and
        then only if supybot.databases.plugins.channelSpecific is True.
        """
        filenames = glob.glob(pattern)
        if not filenames:
            irc.errorInvalid('glob', pattern)
        for filename in filenames:
            fd = file(filename)
            for line in fd:
                self.db.train(channel, language, line)
            fd.close()
        irc.replySuccess()
    train = wrap(train, ['channeldb', 'something', 'something'])


Class = Bayes

# vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78:
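
A standalone sketch of the acceptance rule used by PickleBayesDB.DB.guess()
above (the matches lists are invented sample data; note that the real method
also trains the 'normal' pool when the top score is too low):

# The top match is accepted only if it scores above 0.5 and beats the
# runner-up by at least 0.4; otherwise guess() returns None.
def accept(matches):
    if not matches or matches[0][1] <= 0.5:
        return None
    if len(matches) > 1 and matches[0][1] - matches[1][1] < 0.4:
        return None
    return matches[0]

print accept([('fr', 0.97), ('en', 0.12)])  # ('fr', 0.97)
print accept([('fr', 0.55), ('en', 0.45)])  # None: winners too close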