2005-01-19 14:14:38 +01:00
|
|
|
###
|
2005-01-27 07:59:08 +01:00
|
|
|
# Copyright (c) 2002-2005, Jeremiah Fincher
|
2005-01-19 14:14:38 +01:00
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions, and the following disclaimer in the
|
|
|
|
# documentation and/or other materials provided with the distribution.
|
|
|
|
# * Neither the name of the author of this software nor the name of
|
|
|
|
# contributors to this software may be used to endorse or promote products
|
|
|
|
# derived from this software without specific prior written consent.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
###
|
|
|
|
|
2005-01-27 07:59:08 +01:00
|
|
|
def window(L, size):
    """list * size -> window iterable

    Yield every contiguous slice of L that is exactly `size` items long,
    moving from left to right one position at a time.  L must support
    len() and slicing.  If L is shorter than `size`, nothing is yielded.

    Raises ValueError if size is less than 1.  Because this is a
    generator, the checks run when iteration starts, not at call time.

    >>> list(window([1, 2, 3, 4], 2))
    [[1, 2], [2, 3], [3, 4]]
    """
    # Catches the common mistake of calling window(size, L).
    assert not isinstance(L, int), 'Argument order swapped: window(L, size)'
    if size < 1:
        # Call-style raise is valid on both Python 2 and Python 3; the old
        # "raise E, msg" form is a syntax error on Python 3.
        raise ValueError('size <= 0 disallowed.')
    # A negative count (L shorter than size) yields an empty range.
    for i in range(len(L) - (size - 1)):
        yield L[i:i + size]
|
2005-01-19 14:14:38 +01:00
|
|
|
|
2005-01-27 07:59:08 +01:00
|
|
|
def mapinto(f, L):
    """Replace each element of L, in place, with the result of f(element).

    L is modified directly; nothing is returned.
    """
    for position, item in enumerate(L):
        L[position] = f(item)
|
2005-01-19 14:14:38 +01:00
|
|
|
|
2009-07-01 22:40:58 +02:00
|
|
|
def renumerate(L):
    """Yield (index, item) pairs of L from the last index down to 0.

    This is enumerate() in reverse order.  L must support len() and
    integer indexing.

    >>> list(renumerate('abc'))
    [(2, 'c'), (1, 'b'), (0, 'a')]
    """
    # range() rather than xrange() so this also runs on Python 3.
    for i in range(len(L) - 1, -1, -1):
        yield (i, L[i])
|
|
|
|
|
2010-04-07 18:33:28 +02:00
|
|
|
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.

    Note that this is the restricted ("optimal string alignment") form
    of the distance: a transposed pair is never edited again, so for
    example 'ca' -> 'abc' costs 3 rather than 2.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein('ca', 'abc')
    3

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a (len(seq1) + 1) x (len(seq2) + 1)
    # matrix.  However, only the current and two previous rows are needed
    # at once, so we only store those.
    # Sourced from http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
    oneago = None
    # list() is needed on Python 3, where range() is not a list and cannot
    # be concatenated with one.
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in range(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in range(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            # The comparison is a bool, which adds as 0 or 1.
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                    and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                # For y == 1 the index -1 wraps to the leftmost column.
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    # For empty seq2 the index -1 selects the leftmost column, len(seq1).
    return thisrow[len(seq2) - 1]
|
2005-01-19 14:14:38 +01:00
|
|
|
|
2006-02-11 16:52:51 +01:00
|
|
|
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
|
2005-01-19 14:14:38 +01:00
|
|
|
|