Skip to content

Commit

Permalink
Merge pull request #84 from juliangilbey/split-damerau-levenshtein
Browse files Browse the repository at this point in the history
Add new DamerauLevenshtein... classes
  • Loading branch information
orsinium authored Sep 18, 2022
2 parents c9fbf57 + efd915c commit 19b7238
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 13 deletions.
30 changes: 24 additions & 6 deletions tests/test_edit/test_damerau_levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
# project
import textdistance


ALG = textdistance.DamerauLevenshtein


@pytest.mark.parametrize('left, right, expected', [
COMMON = [
('test', 'text', 1),
('test', 'tset', 1),
('test', 'qwy', 4),
Expand All @@ -24,15 +22,35 @@
('ab', 'ba', 1),
('ab', 'cde', 3),
('ab', 'ac', 1),
('ab', 'ba', 1),
('ab', 'bc', 2),
]


@pytest.mark.parametrize('left, right, expected', COMMON + [
('ab', 'bca', 3),
('abcd', 'bdac', 4),
])
def test_distance(left, right, expected):
def test_distance_restricted(left, right, expected):
actual = ALG(external=False)(left, right)
assert actual == expected

actual = ALG(external=True)(left, right)
assert actual == expected

actual = ALG()._pure_python(left, right)
actual = ALG()._pure_python_restricted(left, right)
assert actual == expected


@pytest.mark.parametrize('left, right, expected', COMMON + [
    ('ab', 'bca', 2),
    ('abcd', 'bdac', 3),
])
def test_distance_unrestricted(left, right, expected):
    """Check the unrestricted distance through each way of invoking it.

    The shared COMMON cases are extended with two pairs whose expected
    value here (2 and 3) differs from the restricted expectation.
    """
    # Instance configured with external=False.
    without_external = ALG(external=False, restricted=False)
    assert without_external(left, right) == expected

    # Instance configured with external=True.
    with_external = ALG(external=True, restricted=False)
    assert with_external(left, right) == expected

    # Call the pure-Python routine directly, bypassing __call__.
    assert ALG()._pure_python_unrestricted(left, right) == expected
55 changes: 52 additions & 3 deletions textdistance/algorithms/edit_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ class DamerauLevenshtein(_Base):
* substitution: ABC -> ABE, ADC, FBC..
* transposition: ABC -> ACB, BAC
If `restricted=False`, it will calculate unrestricted distance,
where the same character can be touched more than once.
So the distance between BA and ACB is 2: BA -> AB -> ACB.
https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
"""

Expand All @@ -156,10 +160,12 @@ def __init__(
qval: int = 1,
test_func: TestFunc | None = None,
external: bool = True,
restricted: bool = True,
) -> None:
self.qval = qval
self.test_func = test_func or self._ident
self.external = external
self.restricted = restricted

def _numpy(self, s1: Sequence[T], s2: Sequence[T]) -> int:
# TODO: doesn't pass tests, need improve
Expand Down Expand Up @@ -194,11 +200,52 @@ def _numpy(self, s1: Sequence[T], s2: Sequence[T]) -> int:

return d[len(s1) - 1][len(s2) - 1]

def _pure_python(self, s1: Sequence[T], s2: Sequence[T]) -> int:
def _pure_python_unrestricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
    """Return the unrestricted Damerau-Levenshtein distance of s1 and s2.

    Dynamic-programming implementation following
    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance

    Elements are compared with ``self.test_func``. Elements of ``s1`` are
    also used as dictionary keys, so they have to be hashable.
    """
    rows = len(s1)
    cols = len(s2)

    # Upper bound on any distance; stored in the extra border at index -1
    # so that a transposition reaching into the border is never the minimum.
    sentinel = rows + cols

    # The matrix is a dict keyed by (row, column).
    table: dict[tuple[int, int], int] = {}
    table[-1, -1] = sentinel
    for row in range(rows + 1):
        table[row, -1] = sentinel
        table[row, 0] = row
    for col in range(cols + 1):
        table[-1, col] = sentinel
        table[0, col] = col

    # For every element of s1 processed so far: the last (1-based) row it
    # occupied.
    last_row_of: dict[T, int] = {}

    for row, left in enumerate(s1, start=1):
        # Last column of the current row in which the two elements matched.
        last_match_col = 0
        for col, right in enumerate(s2, start=1):
            prev_row = last_row_of.get(right, 0)
            prev_col = last_match_col

            if self.test_func(left, right):
                last_match_col = col
                step = 0
            else:
                step = 1

            substitution = table[row - 1, col - 1] + step
            insertion = table[row, col - 1] + 1
            deletion = table[row - 1, col] + 1
            # One swap plus the elements skipped between the swapped pair.
            transposition = (
                table[prev_row - 1, prev_col - 1]
                + (row - prev_row - 1)
                + 1
                + (col - prev_col - 1)
            )
            table[row, col] = min(
                substitution, insertion, deletion, transposition,
            )
        last_row_of[left] = row

    return table[rows, cols]

def _pure_python_restricted(self, s1: Sequence[T], s2: Sequence[T]) -> int:
"""
https://www.guyrutenberg.com/2008/12/15/damerau-levenshtein-distance-in-python/
"""
d = {}
d: dict[tuple[int, int], int] = {}

# matrix
for i in range(-1, len(s1) + 1):
Expand Down Expand Up @@ -241,7 +288,9 @@ def __call__(self, s1: Sequence[T], s2: Sequence[T]) -> int:
# if numpy:
# return self._numpy(s1, s2)
# else:
return self._pure_python(s1, s2)
if self.restricted:
return self._pure_python_restricted(s1, s2)
return self._pure_python_unrestricted(s1, s2)


class JaroWinkler(_BaseSimilarity):
Expand Down
4 changes: 4 additions & 0 deletions textdistance/libraries.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
{
"DamerauLevenshtein": [
[
"rapidfuzz.distance.OSA",
"distance"
],
[
"rapidfuzz.distance.DamerauLevenshtein",
"distance"
Expand Down
12 changes: 8 additions & 4 deletions textdistance/libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,14 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
reg = prototype.register

alg = 'DamerauLevenshtein'
reg(alg, LibraryBase('abydos.distance', 'DamerauLevenshtein', presets={}, attr='dist_abs'))
reg(alg, LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance'))
reg(alg, TextLibrary('jellyfish', 'damerau_levenshtein_distance'))
reg(alg, LibraryBase('rapidfuzz.distance.DamerauLevenshtein', 'distance'))
reg(alg, LibraryBase(
'abydos.distance', 'DamerauLevenshtein', presets={}, attr='dist_abs',
conditions=dict(restricted=False),
))
reg(alg, LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance', conditions=dict(restricted=True)))
reg(alg, TextLibrary('jellyfish', 'damerau_levenshtein_distance', conditions=dict(restricted=False)))
reg(alg, LibraryBase('rapidfuzz.distance.DamerauLevenshtein', 'distance', conditions=dict(restricted=False)))
reg(alg, LibraryBase('rapidfuzz.distance.OSA', 'distance', conditions=dict(restricted=True)))

alg = 'Hamming'
reg(alg, LibraryBase('abydos.distance', 'Hamming', presets={}, attr='dist_abs'))
Expand Down

0 comments on commit 19b7238

Please sign in to comment.