add damerau-levenshtein distance to supybot.utils.seq

Use it in the Factoids invalid-command handler to match possible typos, and write tests for the same.
2026-01-23 11:07:59 +01:00 · 2010-04-07 12:33:28 -04:00 · 2010-04-07 12:33:28 -04:00 · 5d9273cd5a
commit 5d9273cd5a
parent f4d47876d4
3 changed files with 81 additions and 9 deletions
--- a/plugins/Factoids/plugin.py
+++ b/plugins/Factoids/plugin.py
@ -52,6 +52,9 @@ try:
 except ImportError:
    from pysqlite2 import dbapi2 as sqlite3 # for python2.4
 import re
 from supybot.utils.seq import dameraulevenshtein
 # these are needed cuz we are overriding getdb
 import threading
 import supybot.world as world
@ -217,14 +220,36 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
        #return [t[0] for t in cursor.fetchall()]
    def _searchFactoid(self, channel, key):
        """Try to typo-match input to possible factoids.
        Assume first letter is correct, to reduce processing time.        
        First, try a simple wildcard search.
        If that fails, use the Damerau-Levenshtein edit-distance metric.
        """
        # if you made a typo in a two-character key, boo on you.
        if len(key) < 3:
            return []
        db = self.getDb(channel)
        cursor = db.cursor()
-        key = '%' + key + '%'
+        cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""", ('%' + key + '%',))
-        cursor.execute("""SELECT key FROM keys
+        wildcardkeys = cursor.fetchall()
-                          WHERE key LIKE ?
+        if len(wildcardkeys) > 0:
-                          LIMIT 20""", (key,))
+            return [line[0] for line in wildcardkeys]
        return cursor.fetchall()
        cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""", (key[0] + '%',))
        flkeys = cursor.fetchall()
        if len(flkeys) == 0:
            return []
        flkeys = [line[0] for line in flkeys]
        dl_metrics = [dameraulevenshtein(key, sourcekey) for sourcekey in flkeys]
        dict_metrics = dict(zip(flkeys, dl_metrics))
        if min(dl_metrics) <= 2:
            return [key for key,item in dict_metrics.iteritems() if item <= 2]
        if min(dl_metrics) <= 3:
            return [key for key,item in dict_metrics.iteritems() if item <= 3]
        return []
    def _updateRank(self, channel, factoids):
        if self.registryValue('keepRankInfo', channel):
@ -281,9 +306,8 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
                else:
                    if self.registryValue('replyWhenInvalidCommandSearchKeys'):
                        factoids = self._searchFactoid(channel, key)
                        #print 'searchfactoids result:', factoids, '>'
                        if factoids:
-                            keylist = ["'%s'" % (fact[0],) for fact in factoids]
+                            keylist = ["'%s'" % (fact,) for fact in factoids]
                            keylist = ', '.join(keylist)
                            irc.reply("I do not know about '%s', but I do know about these similar topics: %s" % (key, keylist))
--- a/plugins/Factoids/test.py
+++ b/plugins/Factoids/test.py
@ -165,6 +165,10 @@ class FactoidsTestCase(ChannelPluginTestCase):
            self.assertRegexp('foo', 'bar')
            self.assertNotError('learn mooz as cowz')
            self.assertRegexp('moo', 'mooz')
            self.assertRegexp('mzo', 'mooz')
            self.assertRegexp('moz', 'mooz')
            self.assertNotError('learn moped as pretty fast')
            self.assertRegexp('moe', 'mooz.*moped')
            self.assertError('nosuchthing')
        finally:
            conf.supybot.plugins.Factoids.\
--- a/src/utils/seq.py
+++ b/src/utils/seq.py
@ -45,7 +45,51 @@ def renumerate(L):
    for i in xrange(len(L)-1, -1, -1):
        yield (i, L[i])
 def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work (both arguments must support len() and
    integer indexing).

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory. A transposed pair is not edited
    again afterwards, so e.g. 'ca' -> 'abc' costs 3 rather than 2.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:

    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a (len(seq1) + 1) x (len(seq2) + 1)
    # matrix. However, only the current and two previous rows are needed
    # at once, so we only store those.
    # Sourced from http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
    oneago = None
    # Row for the empty prefix of seq1. list() is required so that the
    # concatenation also works where range() returns a lazy object.
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x, item1 in enumerate(seq1):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y, item2 in enumerate(seq2):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            # A boolean adds as 0 (match) or 1 (substitution).
            subcost = oneago[y - 1] + (item1 != item2)
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions. x > 0 guarantees
            # that twoago is a real row (it is None only while x == 0).
            if (x > 0 and y > 0 and item1 == seq2[y - 1]
                    and seq1[x - 1] == item2 and item1 != item2):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    # For an empty seq2 this reads index -1, i.e. the leftmost column,
    # which holds len(seq1).
    return thisrow[len(seq2) - 1]
 # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: