mirror of
https://github.com/Mikaela/Limnoria.git
synced 2024-11-23 19:19:32 +01:00
add damerau-levenshtein distance to supybot.utils.seq
use it in the Factoids invalid-command handler to match possible typos; write tests for the same.
This commit is contained in:
parent
f4d47876d4
commit
5d9273cd5a
@ -52,6 +52,9 @@ try:
|
||||
except ImportError:
|
||||
from pysqlite2 import dbapi2 as sqlite3 # for python2.4
|
||||
|
||||
import re
|
||||
from supybot.utils.seq import dameraulevenshtein
|
||||
|
||||
# these are needed cuz we are overriding getdb
|
||||
import threading
|
||||
import supybot.world as world
|
||||
@ -217,15 +220,37 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
|
||||
#return [t[0] for t in cursor.fetchall()]
|
||||
|
||||
def _searchFactoid(self, channel, key):
    """Try to typo-match input to possible factoids.

    Assume first letter is correct, to reduce processing time.
    First, try a simple wildcard search.
    If that fails, use the Damerau-Levenshtein edit-distance metric.

    Returns a list of factoid key strings (possibly empty).  Keys shorter
    than three characters are never matched.  When the edit-distance
    fallback is used, every key within distance 2 is returned; only if no
    key is that close are the keys at distance 3 returned instead.
    """
    # if you made a typo in a two-character key, boo on you.
    if len(key) < 3:
        return []

    db = self.getDb(channel)
    cursor = db.cursor()

    # Pass 1: any stored key that contains the input as a substring.
    cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""",
                   ('%' + key + '%',))
    wildcardkeys = [row[0] for row in cursor.fetchall()]
    if wildcardkeys:
        return wildcardkeys

    # Pass 2: restrict the candidates to keys sharing the first character,
    # then rank them by edit distance to the (unmodified) input key.
    cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""",
                   (key[0] + '%',))
    flkeys = [row[0] for row in cursor.fetchall()]
    if not flkeys:
        return []

    # Keep the database order and drop duplicates so that the reply is
    # deterministic rather than dependent on dict/hash ordering.
    scored = []
    seen = set()
    for candidate in flkeys:
        if candidate in seen:
            continue
        seen.add(candidate)
        scored.append((candidate, dameraulevenshtein(key, candidate)))

    best = min([distance for (_, distance) in scored])
    if best > 3:
        return []
    # Prefer the close matches; only fall back to distance 3 when nothing
    # is within distance 2.
    if best <= 2:
        threshold = 2
    else:
        threshold = 3
    return [candidate for (candidate, distance) in scored
            if distance <= threshold]
|
||||
|
||||
def _updateRank(self, channel, factoids):
|
||||
if self.registryValue('keepRankInfo', channel):
|
||||
db = self.getDb(channel)
|
||||
@ -281,9 +306,8 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
|
||||
else:
|
||||
if self.registryValue('replyWhenInvalidCommandSearchKeys'):
|
||||
factoids = self._searchFactoid(channel, key)
|
||||
#print 'searchfactoids result:', factoids, '>'
|
||||
if factoids:
|
||||
keylist = ["'%s'" % (fact[0],) for fact in factoids]
|
||||
keylist = ["'%s'" % (fact,) for fact in factoids]
|
||||
keylist = ', '.join(keylist)
|
||||
irc.reply("I do not know about '%s', but I do know about these similar topics: %s" % (key, keylist))
|
||||
|
||||
|
@ -165,6 +165,10 @@ class FactoidsTestCase(ChannelPluginTestCase):
|
||||
self.assertRegexp('foo', 'bar')
|
||||
self.assertNotError('learn mooz as cowz')
|
||||
self.assertRegexp('moo', 'mooz')
|
||||
self.assertRegexp('mzo', 'mooz')
|
||||
self.assertRegexp('moz', 'mooz')
|
||||
self.assertNotError('learn moped as pretty fast')
|
||||
self.assertRegexp('moe', 'mooz.*moped')
|
||||
self.assertError('nosuchthing')
|
||||
finally:
|
||||
conf.supybot.plugins.Factoids.\
|
||||
|
@ -45,7 +45,51 @@ def renumerate(L):
|
||||
for i in xrange(len(L)-1, -1, -1):
|
||||
yield (i, L[i])
|
||||
|
||||
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2

    Empty sequences are handled; the distance is then the length of the
    other sequence:
    >>> dameraulevenshtein('', 'abc')
    3
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a (len(seq1) + 1) * (len(seq2) + 1)
    # matrix.  However, only the current and two previous rows are needed
    # at once, so we only store those.
    # Sourced from http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
    oneago = None
    # list(...) keeps the concatenation valid where range() does not return
    # a list (Python 3); on Python 2 it is a harmless copy.
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in range(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in range(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            # The comparison yields a bool, which adds as 0 or 1.
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions.  x > 0 guarantees
            # that twoago is a real row (it is None on the first pass).
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                    and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    # For an empty seq2 this indexes -1, i.e. the leftmost column, which
    # holds the number of deletions needed.
    return thisrow[len(seq2) - 1]
|
||||
|
||||
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user