Add Damerau-Levenshtein distance to supybot.utils.seq

Use it in the Factoids invalid-command handler to match possible typos, and write tests for the same.
2026-02-02 07:57:56 +01:00 · 2010-04-07 12:33:28 -04:00 · 2010-04-07 12:33:28 -04:00 · b306c5440f
commit b306c5440f
parent bdc8fd5285
3 changed files with 81 additions and 9 deletions
--- a/plugins/Factoids/plugin.py
+++ b/plugins/Factoids/plugin.py
@ -54,6 +54,9 @@ try:
 except ImportError:
    from pysqlite2 import dbapi2 as sqlite3 # for python2.4

+import re
+from supybot.utils.seq import dameraulevenshtein
+
 # these are needed cuz we are overriding getdb
 import threading
 import supybot.world as world
@ -219,15 +222,37 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
        #return [t[0] for t in cursor.fetchall()]
    
    def _searchFactoid(self, channel, key):
+        """Try to typo-match input to possible factoids.
+        
+        First, try a simple wildcard (substring) search.
+        If that fails, fall back to the Damerau-Levenshtein edit-distance
+        metric, assuming the first letter is correct to reduce processing time.
+        """
+        # if you made a typo in a two-character key, boo on you.
+        if len(key) < 3:
+            return []
+            
        db = self.getDb(channel)
        cursor = db.cursor()
-        key = '%' + key + '%'
-        cursor.execute("""SELECT key FROM keys
-                          WHERE key LIKE ?
-                          LIMIT 20""", (key,))
-        return cursor.fetchall()
-    
-    
+        cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""", ('%' + key + '%',))
+        wildcardkeys = cursor.fetchall()
+        if len(wildcardkeys) > 0:
+            return [line[0] for line in wildcardkeys]
+        
+        cursor.execute("""SELECT key FROM keys WHERE key LIKE ?""", (key[0] + '%',))
+        flkeys = cursor.fetchall()
+        if len(flkeys) == 0:
+            return []
+        flkeys = [line[0] for line in flkeys]
+        dl_metrics = [dameraulevenshtein(key, sourcekey) for sourcekey in flkeys]
+        dict_metrics = dict(zip(flkeys, dl_metrics))
+        if min(dl_metrics) <= 2:
+            return [key for key,item in dict_metrics.iteritems() if item <= 2]
+        if min(dl_metrics) <= 3:
+            return [key for key,item in dict_metrics.iteritems() if item <= 3]
+        
+        return []
+                
    def _updateRank(self, channel, factoids):
        if self.registryValue('keepRankInfo', channel):
            db = self.getDb(channel)
@ -283,9 +308,8 @@ class Factoids(callbacks.Plugin, plugins.ChannelDBHandler):
                else:
                    if self.registryValue('replyWhenInvalidCommandSearchKeys'):
                        factoids = self._searchFactoid(channel, key)
-                        #print 'searchfactoids result:', factoids, '>'
                        if factoids:
-                            keylist = ["'%s'" % (fact[0],) for fact in factoids]
+                            keylist = ["'%s'" % (fact,) for fact in factoids]
                            keylist = ', '.join(keylist)
                            irc.reply("I do not know about '%s', but I do know about these similar topics: %s" % (key, keylist))

--- a/plugins/Factoids/test.py
+++ b/plugins/Factoids/test.py
@ -165,6 +165,10 @@ class FactoidsTestCase(ChannelPluginTestCase):
            self.assertRegexp('foo', 'bar')
            self.assertNotError('learn mooz as cowz')
            self.assertRegexp('moo', 'mooz')
+            self.assertRegexp('mzo', 'mooz')
+            self.assertRegexp('moz', 'mooz')
+            self.assertNotError('learn moped as pretty fast')
+            self.assertRegexp('moe', 'mooz.*moped')
            self.assertError('nosuchthing')
        finally:
            conf.supybot.plugins.Factoids.\
--- a/src/utils/seq.py
+++ b/src/utils/seq.py
@ -45,7 +45,51 @@ def renumerate(L):
    for i in xrange(len(L)-1, -1, -1):
        yield (i, L[i])

+def dameraulevenshtein(seq1, seq2):
+    """Calculate the Damerau-Levenshtein distance between sequences.

+    This distance is the number of additions, deletions, substitutions,
+    and transpositions needed to transform the first sequence into the
+    second. Although generally used with strings, any sequences of
+    comparable objects will work.
+
+    Transpositions are exchanges of *consecutive* characters; all other
+    operations are self-explanatory.
+
+    This implementation is O(N*M) time and O(M) space, for N and M the
+    lengths of the two sequences.
+
+    >>> dameraulevenshtein('ba', 'abc')
+    2
+    >>> dameraulevenshtein('fee', 'deed')
+    2
+
+    It works with arbitrary sequences too:
+    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
+    2
+    """
+    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
+    # Conceptually, this is based on a (len(seq1) + 1) * (len(seq2) + 1) matrix.
+    # However, only the current and two previous rows are needed at once,
+    # so we only store those.
+    # Sourced from http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
+    oneago = None
+    thisrow = range(1, len(seq2) + 1) + [0]
+    for x in xrange(len(seq1)):
+        # Python lists wrap around for negative indices, so put the
+        # leftmost column at the *end* of the list. This matches with
+        # the zero-indexed strings and saves extra calculation.
+        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
+        for y in xrange(len(seq2)):
+            delcost = oneago[y] + 1
+            addcost = thisrow[y - 1] + 1
+            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
+            thisrow[y] = min(delcost, addcost, subcost)
+            # This block deals with transpositions
+            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
+                and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
+                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
+    return thisrow[len(seq2) - 1]

 # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: