mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-11-24 03:29:28 +01:00
Add Damerau-Levenshtein distance to supybot.utils.seq.
Use it in the Factoids invalid-command handler to match possible typos, and write tests for the same.
This commit is contained in:
parent
f4d47876d4
commit
5d9273cd5a
@ -52,6 +52,9 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from pysqlite2 import dbapi2 as sqlite3 # for python2.4
|
from pysqlite2 import dbapi2 as sqlite3 # for python2.4
|
||||||
|
|
||||||
|
import re
|
||||||
|
from supybot.utils.seq import dameraulevenshtein
|
||||||
|
|
||||||
# these are needed cuz we are overriding getdb
|
# these are needed cuz we are overriding getdb
|
||||||
import threading
|
import threading
|
||||||
import supybot.world as world
|
import supybot.world as world
|
||||||
@ -217,14 +220,36 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
|
|||||||
#return [t[0] for t in cursor.fetchall()]
|
#return [t[0] for t in cursor.fetchall()]
|
||||||
|
|
||||||
def _searchFactoid(self, channel, key):
    """Try to typo-match input to possible factoids.

    Assume first letter is correct, to reduce processing time.
    First, try a simple wildcard search.
    If that fails, use the Damerau-Levenshtein edit-distance metric.

    Returns a list of factoid key strings, which is empty when nothing
    is close enough.  Edit-distance matches are ordered closest first
    (ties broken alphabetically) so that the reply is deterministic.
    """
    # if you made a typo in a two-character key, boo on you.
    if len(key) < 3:
        return []

    db = self.getDb(channel)
    cursor = db.cursor()

    # Stage 1: plain substring match.
    # NOTE(review): the previous implementation capped this query with
    # LIMIT 20; confirm whether dropping the cap was intentional, since a
    # short key can match many factoids.
    cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""",
                   ('%' + key + '%',))
    wildcardkeys = [row[0] for row in cursor.fetchall()]
    if wildcardkeys:
        return wildcardkeys

    # Stage 2: edit distance against every key sharing the first letter.
    cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""",
                   (key[0] + '%',))
    distances = {}
    for row in cursor.fetchall():
        candidate = row[0]
        distances[candidate] = dameraulevenshtein(key, candidate)
    if not distances:
        return []

    # Prefer tight matches (distance <= 2); only when there are none do we
    # widen the net to distance 3.  Anything further away is not a typo.
    best = min(distances.values())
    if best > 3:
        return []
    if best <= 2:
        threshold = 2
    else:
        threshold = 3

    # Sort on (distance, key) rather than relying on dict iteration order,
    # which is arbitrary and made the reply order unpredictable.
    ranked = [(distance, candidate)
              for candidate, distance in distances.items()
              if distance <= threshold]
    ranked.sort()
    return [candidate for distance, candidate in ranked]
|
||||||
|
|
||||||
def _updateRank(self, channel, factoids):
|
def _updateRank(self, channel, factoids):
|
||||||
if self.registryValue('keepRankInfo', channel):
|
if self.registryValue('keepRankInfo', channel):
|
||||||
@ -281,9 +306,8 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
|
|||||||
else:
|
else:
|
||||||
if self.registryValue('replyWhenInvalidCommandSearchKeys'):
|
if self.registryValue('replyWhenInvalidCommandSearchKeys'):
|
||||||
factoids = self._searchFactoid(channel, key)
|
factoids = self._searchFactoid(channel, key)
|
||||||
#print 'searchfactoids result:', factoids, '>'
|
|
||||||
if factoids:
|
if factoids:
|
||||||
keylist = ["'%s'" % (fact[0],) for fact in factoids]
|
keylist = ["'%s'" % (fact,) for fact in factoids]
|
||||||
keylist = ', '.join(keylist)
|
keylist = ', '.join(keylist)
|
||||||
irc.reply("I do not know about '%s', but I do know about these similar topics: %s" % (key, keylist))
|
irc.reply("I do not know about '%s', but I do know about these similar topics: %s" % (key, keylist))
|
||||||
|
|
||||||
|
@ -165,6 +165,10 @@ class FactoidsTestCase(ChannelPluginTestCase):
|
|||||||
self.assertRegexp('foo', 'bar')
|
self.assertRegexp('foo', 'bar')
|
||||||
self.assertNotError('learn mooz as cowz')
|
self.assertNotError('learn mooz as cowz')
|
||||||
self.assertRegexp('moo', 'mooz')
|
self.assertRegexp('moo', 'mooz')
|
||||||
|
self.assertRegexp('mzo', 'mooz')
|
||||||
|
self.assertRegexp('moz', 'mooz')
|
||||||
|
self.assertNotError('learn moped as pretty fast')
|
||||||
|
self.assertRegexp('moe', 'mooz.*moped')
|
||||||
self.assertError('nosuchthing')
|
self.assertError('nosuchthing')
|
||||||
finally:
|
finally:
|
||||||
conf.supybot.plugins.Factoids.\
|
conf.supybot.plugins.Factoids.\
|
||||||
|
@ -45,7 +45,51 @@ def renumerate(L):
|
|||||||
for i in xrange(len(L)-1, -1, -1):
|
for i in xrange(len(L)-1, -1, -1):
|
||||||
yield (i, L[i])
|
yield (i, L[i])
|
||||||
|
|
||||||
|
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.

    Strictly speaking this is the restricted ("optimal string alignment")
    variant: once two elements have been transposed they are not edited
    again, so e.g. 'ca' -> 'abc' costs 3 rather than 2.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a
    # (len(seq1) + 1) x (len(seq2) + 1) matrix.
    # However, only the current and two previous rows are needed at once,
    # so we only store those.
    # Sourced from http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
    width = len(seq2)
    oneago = None
    # list(range(...)) rather than range(...) + [0]: on Python 3 a range
    # object cannot be concatenated with a list.
    thisrow = list(range(1, width + 1)) + [0]
    for x in range(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * width + [x + 1]
        for y in range(width):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            # The comparison yields a bool, which adds as 0 or 1.
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions.  x > 0 guarantees
            # that twoago is a real row and not the initial None.
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                    and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    # For an empty seq2 this indexes -1, i.e. the leftmost column, which
    # holds len(seq1) -- the correct distance.
    return thisrow[width - 1]
|
||||||
|
|
||||||
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user