Add Damerau-Levenshtein distance to supybot.utils.seq

Use it in the Factoids invalid-command handler to match possible typos.
Write tests for the same.
This commit is contained in:
Daniel Folkinshteyn 2010-04-07 12:33:28 -04:00 committed by Valentin Lorentz
parent bdc8fd5285
commit b306c5440f
3 changed files with 81 additions and 9 deletions

View File

@ -54,6 +54,9 @@ try:
except ImportError:
from pysqlite2 import dbapi2 as sqlite3 # for python2.4
import re
from supybot.utils.seq import dameraulevenshtein
# these are needed because we are overriding getdb
import threading
import supybot.world as world
@ -219,15 +222,37 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
#return [t[0] for t in cursor.fetchall()]
def _searchFactoid(self, channel, key):
    """Try to typo-match input to possible factoids.

    Assume first letter is correct, to reduce processing time.
    First, try a simple wildcard search.
    If that fails, use the Damerau-Levenshtein edit-distance metric.

    channel: passed to self.getDb() to select the database to search.
    key: the text the user typed that did not match a known factoid.

    Returns a (possibly empty) list of key strings, in the order the
    database returned them.  If any candidate is within distance 2, only
    candidates within distance 2 are returned; otherwise candidates within
    distance 3 are returned; otherwise the list is empty.
    """
    # if you made a typo in a two-character key, boo on you.
    if len(key) < 3:
        return []
    db = self.getDb(channel)
    cursor = db.cursor()
    # NOTE(review): '%' and '_' inside key are not escaped, so they act as
    # LIKE wildcards in both queries below.
    cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""",
                   ('%' + key + '%',))
    wildcardkeys = [row[0] for row in cursor.fetchall()]
    if wildcardkeys:
        return wildcardkeys
    # No substring match: fall back to edit distance, restricted to keys
    # sharing the first character so only a subset has to be scored.
    cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""",
                   (key[0] + '%',))
    flkeys = [row[0] for row in cursor.fetchall()]
    if not flkeys:
        return []
    # Keep (candidate, distance) pairs in a list rather than a dict so the
    # result order is deterministic (it follows the query's row order).
    scored = [(candidate, dameraulevenshtein(key, candidate))
              for candidate in flkeys]
    closest = min([distance for (_, distance) in scored])
    if closest <= 2:
        threshold = 2
    elif closest <= 3:
        threshold = 3
    else:
        return []
    return [candidate for (candidate, distance) in scored
            if distance <= threshold]
def _updateRank(self, channel, factoids):
if self.registryValue('keepRankInfo', channel):
db = self.getDb(channel)
@ -283,9 +308,8 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
else:
if self.registryValue('replyWhenInvalidCommandSearchKeys'):
factoids = self._searchFactoid(channel, key)
#print 'searchfactoids result:', factoids, '>'
if factoids:
keylist = ["'%s'" % (fact[0],) for fact in factoids]
keylist = ["'%s'" % (fact,) for fact in factoids]
keylist = ', '.join(keylist)
irc.reply("I do not know about '%s', but I do know about these similar topics: %s" % (key, keylist))

View File

@ -165,6 +165,10 @@ class FactoidsTestCase(ChannelPluginTestCase):
self.assertRegexp('foo', 'bar')
self.assertNotError('learn mooz as cowz')
self.assertRegexp('moo', 'mooz')
self.assertRegexp('mzo', 'mooz')
self.assertRegexp('moz', 'mooz')
self.assertNotError('learn moped as pretty fast')
self.assertRegexp('moe', 'mooz.*moped')
self.assertError('nosuchthing')
finally:
conf.supybot.plugins.Factoids.\

View File

@ -45,7 +45,51 @@ def renumerate(L):
for i in xrange(len(L)-1, -1, -1):
yield (i, L[i])
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.  A transposed pair is never edited
    again, i.e. this is the restricted ("optimal string alignment")
    variant of the metric, so for example 'ca' -> 'abc' costs 3.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    seq1, seq2: indexable sequences supporting len().
    Returns the distance as a non-negative integer.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a (len(seq1) + 1) x (len(seq2) + 1)
    # matrix.  However, only the current and two previous rows are needed
    # at once, so we only store those.
    # Sourced from http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
    oneago = None
    # list() keeps the concatenation valid where range() is not a list.
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x, item1 in enumerate(seq1):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y, item2 in enumerate(seq2):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            # The comparison yields a bool, which adds as 0 or 1.
            subcost = oneago[y - 1] + (item1 != item2)
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and item1 == seq2[y - 1]
                    and seq1[x - 1] == item2 and item1 != item2):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len(seq2) - 1]
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: