From 5d9273cd5a80e4d57cc8e69f55f4de4271db724c Mon Sep 17 00:00:00 2001 From: Daniel Folkinshteyn Date: Wed, 7 Apr 2010 12:33:28 -0400 Subject: [PATCH] add damerau-levenshtein distance to supybot.utils.seq use it in factoids invalid command to match possible typos write tests for same. --- plugins/Factoids/plugin.py | 42 ++++++++++++++++++++++++++++-------- plugins/Factoids/test.py | 4 ++++ src/utils/seq.py | 44 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 9 deletions(-) diff --git a/plugins/Factoids/plugin.py b/plugins/Factoids/plugin.py index 324bda662..707e72023 100644 --- a/plugins/Factoids/plugin.py +++ b/plugins/Factoids/plugin.py @@ -52,6 +52,9 @@ try: except ImportError: from pysqlite2 import dbapi2 as sqlite3 # for python2.4 +import re +from supybot.utils.seq import dameraulevenshtein + # these are needed cuz we are overriding getdb import threading import supybot.world as world @@ -217,15 +220,37 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler): #return [t[0] for t in cursor.fetchall()] def _searchFactoid(self, channel, key): + """Try to typo-match input to possible factoids. + + Assume first letter is correct, to reduce processing time. + First, try a simple wildcard search. + If that fails, use the Damerau-Levenshtein edit-distance metric. + """ + # if you made a typo in a two-character key, boo on you. + if len(key) < 3: + return [] + db = self.getDb(channel) cursor = db.cursor() - key = '%' + key + '%' - cursor.execute("""SELECT key FROM keys - WHERE key LIKE ? - LIMIT 20""", (key,)) - return cursor.fetchall() - - + cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""", ('%' + key + '%',)) + wildcardkeys = cursor.fetchall() + if len(wildcardkeys) > 0: + return [line[0] for line in wildcardkeys] + + cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""", (key[0] + '%',)) + flkeys = cursor.fetchall() + if len(flkeys) == 0: + return [] + flkeys = [line[0] for line in flkeys] + dl_metrics = [dameraulevenshtein(key, sourcekey) for sourcekey in flkeys] + dict_metrics = dict(zip(flkeys, dl_metrics)) + if min(dl_metrics) <= 2: + return [key for key,item in dict_metrics.iteritems() if item <= 2] + if min(dl_metrics) <= 3: + return [key for key,item in dict_metrics.iteritems() if item <= 3] + + return [] + def _updateRank(self, channel, factoids): if self.registryValue('keepRankInfo', channel): db = self.getDb(channel) @@ -281,9 +306,8 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler): else: if self.registryValue('replyWhenInvalidCommandSearchKeys'): factoids = self._searchFactoid(channel, key) - #print 'searchfactoids result:', factoids, '>' if factoids: - keylist = ["'%s'" % (fact[0],) for fact in factoids] + keylist = ["'%s'" % (fact,) for fact in factoids] keylist = ', '.join(keylist) irc.reply("I do not know about '%s', but I do know about these similar topics: %s" % (key, keylist)) diff --git a/plugins/Factoids/test.py b/plugins/Factoids/test.py index ef9ab6cca..1dddfb19c 100644 --- a/plugins/Factoids/test.py +++ b/plugins/Factoids/test.py @@ -165,6 +165,10 @@ class FactoidsTestCase(ChannelPluginTestCase): self.assertRegexp('foo', 'bar') self.assertNotError('learn mooz as cowz') self.assertRegexp('moo', 'mooz') + self.assertRegexp('mzo', 'mooz') + self.assertRegexp('moz', 'mooz') + self.assertNotError('learn moped as pretty fast') + self.assertRegexp('moe', 'mooz.*moped') self.assertError('nosuchthing') finally: conf.supybot.plugins.Factoids.\ diff --git a/src/utils/seq.py b/src/utils/seq.py index 8df15b226..67b0272be 100644 --- a/src/utils/seq.py +++ b/src/utils/seq.py @@ -45,7 +45,51 @@ def renumerate(L): for i in xrange(len(L)-1, -1, -1): yield (i, L[i]) +def dameraulevenshtein(seq1, seq2): + """Calculate the Damerau-Levenshtein distance between sequences. + This distance is the number of additions, deletions, substitutions, + and transpositions needed to transform the first sequence into the + second. Although generally used with strings, any sequences of + comparable objects will work. + + Transpositions are exchanges of *consecutive* characters; all other + operations are self-explanatory. + + This implementation is O(N*M) time and O(M) space, for N and M the + lengths of the two sequences. + + >>> dameraulevenshtein('ba', 'abc') + 2 + >>> dameraulevenshtein('fee', 'deed') + 2 + + It works with arbitrary sequences too: + >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e']) + 2 + """ + # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F + # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix. + # However, only the current and two previous rows are needed at once, + # so we only store those. + # Sourced from http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/ + oneago = None + thisrow = range(1, len(seq2) + 1) + [0] + for x in xrange(len(seq1)): + # Python lists wrap around for negative indices, so put the + # leftmost column at the *end* of the list. This matches with + # the zero-indexed strings and saves extra calculation. + twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] + for y in xrange(len(seq2)): + delcost = oneago[y] + 1 + addcost = thisrow[y - 1] + 1 + subcost = oneago[y - 1] + (seq1[x] != seq2[y]) + thisrow[y] = min(delcost, addcost, subcost) + # This block deals with transpositions + if (x > 0 and y > 0 and seq1[x] == seq2[y - 1] + and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]): + thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) + return thisrow[len(seq2) - 1] # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: