Skip to content

Commit

Permalink
Add new DamerauLevenshteinRestricted and DamerauLevenshteinUnrestrict…
Browse files Browse the repository at this point in the history
…ed classes
  • Loading branch information
Julian Gilbey committed Sep 5, 2022
1 parent 64c121e commit 742edf5
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 15 deletions.
17 changes: 10 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ Features:
| [Hamming](https://en.wikipedia.org/wiki/Hamming_distance) | `Hamming` | `hamming` |
| [MLIPNS](http://www.sial.iias.spb.su/files/386-386-1-PB.pdf) | `Mlipns` | `mlipns` |
| [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) | `Levenshtein` | `levenshtein` |
| [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) | `DamerauLevenshtein` | `damerau_levenshtein` |
| [Damerau-Levenshtein unrestricted](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) | `DamerauLevenshteinUnrestricted` | `damerau_levenshtein` |
| [Damerau-Levenshtein unrestricted](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) | `DamerauLevenshtein` | `damerau_levenshtein` |
| [Damerau-Levenshtein restricted](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) | `DamerauLevenshteinRestricted` | `damerau_levenshtein` |
| [Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) | `JaroWinkler` | `jaro_winkler`, `jaro` |
| [Strcmp95](http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c) | `StrCmp95` | `strcmp95` |
| [Needleman-Wunsch](https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm) | `NeedlemanWunsch` | `needleman_wunsch` |
Expand Down Expand Up @@ -126,7 +128,7 @@ With algorithm specific extras:
pip install "textdistance[Hamming]"
```

Algorithms with available extras: `DamerauLevenshtein`, `Hamming`, `Jaro`, `JaroWinkler`, `Levenshtein`.
Algorithms with available extras: `DamerauLevenshtein` (both versions), `Hamming`, `Jaro`, `JaroWinkler`, `Levenshtein`.

### Dev

Expand Down Expand Up @@ -242,11 +244,12 @@ Without extras installation:

| algorithm | library | function | time |
|-----------|---------|----------|------|
| DamerauLevenshtein | jellyfish | damerau_levenshtein_distance | 0.00965294 |
| DamerauLevenshtein | pyxdameraulevenshtein | damerau_levenshtein_distance | 0.151378 |
| DamerauLevenshtein | pylev | damerau_levenshtein | 0.766461 |
| DamerauLevenshtein | **textdistance** | DamerauLevenshtein | 4.13463 |
| DamerauLevenshtein | abydos | damerau_levenshtein | 4.3831 |
| DamerauLevenshteinUnrestricted | jellyfish | damerau_levenshtein_distance | 0.00965294 |
| DamerauLevenshteinUnrestricted | **textdistance** | DamerauLevenshteinUnrestricted | 1.130407 |
| DamerauLevenshteinUnrestricted | abydos | damerau_levenshtein | 4.3831 |
| DamerauLevenshteinRestricted | pyxdameraulevenshtein | damerau_levenshtein_distance | 0.151378 |
| DamerauLevenshteinRestricted | pylev | damerau_levenshtein | 0.766461 |
| DamerauLevenshteinRestricted | **textdistance** | DamerauLevenshteinRestricted | 4.13463 |
| Hamming | Levenshtein | hamming | 0.0014428 |
| Hamming | jellyfish | hamming_distance | 0.00240262 |
| Hamming | distance | hamming | 0.036253 |
Expand Down
74 changes: 70 additions & 4 deletions tests/test_edit/test_damerau_levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,37 @@
import textdistance


ALG = textdistance.DamerauLevenshtein
@pytest.mark.parametrize('left, right, expected', [
    ('test', 'text', 1),
    ('test', 'tset', 1),
    ('test', 'qwy', 4),
    ('test', 'testit', 2),
    ('test', 'tesst', 1),
    ('test', 'tet', 1),
    ('cat', 'hat', 1),
    ('Niall', 'Neil', 3),
    ('aluminum', 'Catalan', 7),
    ('ATCG', 'TAGC', 2),
    ('ab', 'ba', 1),
    ('ab', 'cde', 3),
    ('ab', 'ac', 1),
    ('ab', 'bc', 2),
    ('ab', 'bca', 3),
    ('abcd', 'bdac', 4),
])
def test_distance_restricted(left, right, expected):
    """Check the restricted distance through every entry point.

    The same expectation is asserted for the instance built with
    ``external=False``, the instance built with ``external=True`` and a
    direct call to ``_pure_python``.
    """
    restricted = textdistance.DamerauLevenshteinRestricted

    # Instance constructed with external=False.
    assert restricted(external=False)(left, right) == expected

    # Instance constructed with external=True.
    assert restricted(external=True)(left, right) == expected

    # Pure-Python implementation called directly, bypassing __call__.
    assert restricted()._pure_python(left, right) == expected


@pytest.mark.parametrize('left, right, expected', [
Expand All @@ -24,15 +54,51 @@
('ab', 'ba', 1),
('ab', 'cde', 3),
('ab', 'ac', 1),
('ab', 'bc', 2),
('ab', 'bca', 2),
('abcd', 'bdac', 3),
])
def test_distance_unrestricted(left, right, expected):
alg = textdistance.DamerauLevenshteinUnrestricted

actual = alg(external=False)(left, right)
assert actual == expected

actual = alg(external=True)(left, right)
assert actual == expected

actual = alg()._pure_python(left, right)
assert actual == expected


@pytest.mark.parametrize('left, right, expected', [
('test', 'text', 1),
('test', 'tset', 1),
('test', 'qwy', 4),
('test', 'testit', 2),
('test', 'tesst', 1),
('test', 'tet', 1),
('cat', 'hat', 1),
('Niall', 'Neil', 3),
('aluminum', 'Catalan', 7),
('ATCG', 'TAGC', 2),
('ab', 'ba', 1),
('ab', 'cde', 3),
('ab', 'ac', 1),
('ab', 'bc', 2),
('ab', 'bca', 2),
('abcd', 'bdac', 3),
])
def test_distance(left, right, expected):
actual = ALG(external=False)(left, right)
alg = textdistance.DamerauLevenshtein

actual = alg(external=False)(left, right)
assert actual == expected

actual = ALG(external=True)(left, right)
actual = alg(external=True)(left, right)
assert actual == expected

actual = ALG()._pure_python(left, right)
actual = alg()._pure_python(left, right)
assert actual == expected
89 changes: 86 additions & 3 deletions textdistance/algorithms/edit_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

__all__ = [
'Hamming', 'MLIPNS',
'Levenshtein', 'DamerauLevenshtein',
'Levenshtein', 'DamerauLevenshteinUnrestricted',
'DamerauLevenshteinRestricted', 'DamerauLevenshtein',
'Jaro', 'JaroWinkler', 'StrCmp95',
'NeedlemanWunsch', 'Gotoh', 'SmithWaterman',

Expand Down Expand Up @@ -119,9 +120,9 @@ def __call__(self, s1, s2):
return self._cicled(s1, s2)


class DamerauLevenshtein(_Base):
class DamerauLevenshteinRestricted(_Base):
"""
Compute the absolute Damerau-Levenshtein distance between the two sequences.
Compute the absolute restricted Damerau-Levenshtein distance between the two sequences.
The Damerau-Levenshtein distance is the minimum number of edit operations necessary
for transforming one sequence into the other. The edit operations allowed are:
Expand All @@ -130,6 +131,11 @@ class DamerauLevenshtein(_Base):
* substitution: ABC -> ABE, ADC, FBC..
* transposition: ABC -> ACB, BAC
This class calculates the restricted distance, where the same character
cannot be touched more than once. So the distance between BA and ACB
is 3: BA -> A -> AC -> ACB. Note that BA -> AB -> ACB is disallowed
as the transposition requires AB to remain unchanged thereafter.
https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
"""

Expand Down Expand Up @@ -221,6 +227,83 @@ def __call__(self, s1, s2):
return self._pure_python(s1, s2)


class DamerauLevenshteinUnrestricted(_Base):
    """
    Absolute (unrestricted) Damerau-Levenshtein distance between two sequences.

    The distance is the minimum number of edit operations needed to turn
    one sequence into the other. The permitted operations are:

    * deletion:      ABC -> BC, AC, AB
    * insertion:     ABC -> ABCD, EABC, AEBC..
    * substitution:  ABC -> ABE, ADC, FBC..
    * transposition: ABC -> ACB, BAC

    In the unrestricted variant a character may take part in more than
    one operation, so the distance between BA and ACB is 2:
    BA -> AB -> ACB.

    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    """

    def __init__(self, qval=1, test_func=None, external=True):
        # The attributes are only stored here. `qval` and `external` are
        # not read anywhere in this class -- presumably they are consumed
        # by `_Base` helpers (`_get_sequences`, `quick_answer`); verify
        # against the base class.
        self.external = external
        self.qval = qval
        # Element comparison predicate; falls back to `self._ident`
        # (inherited, not shown here) when no function is supplied.
        self.test_func = test_func or self._ident

    def _pure_python(self, s1, s2):
        """Return the unrestricted distance using a dict-backed DP table.

        Follows the Wikipedia "distance with adjacent transpositions"
        pseudocode. The table has an extra row and column at index -1,
        filled with a sentinel value, so the transposition lookup never
        needs a bounds check.
        """
        rows = len(s1)
        cols = len(s2)

        # No edit script can cost more than rows + cols, so this value
        # can never be chosen by min() over a genuine alternative.
        sentinel = rows + cols

        table = {(-1, -1): sentinel}
        for r in range(rows + 1):
            table[r, -1] = sentinel
            table[r, 0] = r
        for c in range(cols + 1):
            table[-1, c] = sentinel
            table[0, c] = c

        # Maps an element of s1 to the last (1-based) row where it was
        # seen; elements not seen yet read as 0.
        last_row_of = defaultdict(int)

        for r, left in enumerate(s1, start=1):
            # Last (1-based) column in this row where the elements matched.
            last_match_col = 0
            for c, right in enumerate(s2, start=1):
                prev_r = last_row_of[right]
                prev_c = last_match_col

                if self.test_func(left, right):
                    step = 0
                    last_match_col = c
                else:
                    step = 1

                substitution = table[r - 1, c - 1] + step
                insertion = table[r, c - 1] + 1
                deletion = table[r - 1, c] + 1
                transposition = (
                    table[prev_r - 1, prev_c - 1]
                    + (r - prev_r) - 1 + (c - prev_c)
                )
                table[r, c] = min(
                    substitution, insertion, deletion, transposition,
                )
            last_row_of[left] = r

        return table[rows, cols]

    def __call__(self, s1, s2):
        """Compute the distance between `s1` and `s2`.

        The inputs are first normalised by `_get_sequences`. If
        `quick_answer` yields a value (anything other than None) it is
        returned as-is; otherwise the pure-Python implementation is used.
        """
        s1, s2 = self._get_sequences(s1, s2)

        shortcut = self.quick_answer(s1, s2)
        if shortcut is None:
            return self._pure_python(s1, s2)
        return shortcut


# The plain `DamerauLevenshtein` name is an alias of the unrestricted variant.
DamerauLevenshtein = DamerauLevenshteinUnrestricted


class JaroWinkler(_BaseSimilarity):
"""
Computes the Jaro-Winkler measure between two strings.
Expand Down
12 changes: 12 additions & 0 deletions textdistance/libraries.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,19 @@
[
"jellyfish",
"damerau_levenshtein_distance"
]
],
"DamerauLevenshteinUnrestricted": [
[
"rapidfuzz.distance.DamerauLevenshtein",
"distance"
],
[
"jellyfish",
"damerau_levenshtein_distance"
]
],
"DamerauLevenshteinRestricted": [
[
"pyxdameraulevenshtein",
"damerau_levenshtein_distance"
Expand Down
5 changes: 4 additions & 1 deletion textdistance/libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,11 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):

prototype = LibrariesManager()

prototype.register('DamerauLevenshteinUnrestricted', LibraryBase('abydos.distance', 'DamerauLevenshtein'))
prototype.register('DamerauLevenshteinUnrestricted', TextLibrary('jellyfish', 'damerau_levenshtein_distance'))
prototype.register('DamerauLevenshteinUnrestricted', LibraryBase('rapidfuzz.distance.DamerauLevenshtein', 'distance'))
prototype.register('DamerauLevenshteinRestricted', LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance'))
prototype.register('DamerauLevenshtein', LibraryBase('abydos.distance', 'DamerauLevenshtein'))
prototype.register('DamerauLevenshtein', LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance'))
prototype.register('DamerauLevenshtein', TextLibrary('jellyfish', 'damerau_levenshtein_distance'))
prototype.register('DamerauLevenshtein', LibraryBase('rapidfuzz.distance.DamerauLevenshtein', 'distance'))

Expand Down

0 comments on commit 742edf5

Please sign in to comment.