Skip to content

Commit

Permalink
improve handling of None/nan
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Dec 4, 2022
1 parent 6e0a7bb commit 3f83cba
Show file tree
Hide file tree
Showing 32 changed files with 483 additions and 255 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
## Changelog

### [2.14.0] -
#### Changed
- handle `float("nan")` the same way as `None` for query / choice, since NaN is commonly
used to represent missing data in tools like numpy

#### Fixed
- fix handling of `None`/`float("nan")` in `process.distance`

### [2.13.3] - 2022-12-03
#### Fixed
- improve handling of functions wrapped using `functools.wraps`
Expand Down
18 changes: 17 additions & 1 deletion src/rapidfuzz/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from enum import Flag
from typing import Any, Callable
from math import isnan


class ScorerFlag(Flag):
Expand All @@ -29,19 +30,33 @@ def _get_scorer_flags_similarity(**_kwargs: Any) -> dict[str, Any]:
}


def is_none(s: Any) -> bool:
    """Report whether ``s`` should be treated as a missing value.

    A value counts as missing when it is ``None`` or when it is a ``float``
    (including subclasses of ``float``) whose value is NaN. Everything else,
    including empty strings and zero, is a regular value.
    """
    # The isinstance guard keeps isnan() from being called on non-float inputs.
    return s is None or (isinstance(s, float) and isnan(s))


def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]:
    """Return the scorer metadata for a normalized distance.

    Any keyword arguments are accepted and ignored. The returned mapping
    reports 0 as the optimal score, 1 as the worst score and
    ``ScorerFlag.RESULT_F64`` as the flags.
    """
    result_flags = ScorerFlag.RESULT_F64
    return {"optimal_score": 0, "worst_score": 1, "flags": result_flags}


def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]:
    """Return the scorer metadata for a normalized similarity.

    Any keyword arguments are accepted and ignored. The returned mapping
    reports 1 as the optimal score, 0 as the worst score and
    ``ScorerFlag.RESULT_F64`` as the flags.
    """
    result_flags = ScorerFlag.RESULT_F64
    return {"optimal_score": 1, "worst_score": 0, "flags": result_flags}

def _create_scorer(func: Any, cached_scorer_call: dict[str, Callable[..., dict[str, Any]]]):

def _create_scorer(
func: Any, cached_scorer_call: dict[str, Callable[..., dict[str, Any]]]
):
func._RF_ScorerPy = cached_scorer_call
# used to detect the function hasn't been wrapped afterwards
func._RF_OriginalScorer = func
return func


def fallback_import(
module: str,
name: str,
Expand Down Expand Up @@ -93,6 +108,7 @@ def fallback_import(

return cpp_func


# Default scorer metadata mapping whose "get_scorer_flags" entry is
# _get_scorer_flags_distance.
# NOTE(review): presumably passed to _create_scorer for plain distance
# scorers -- confirm against the call sites.
default_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = dict(
    get_scorer_flags=_get_scorer_flags_distance,
)
Expand Down
7 changes: 7 additions & 0 deletions src/rapidfuzz/distance/DamerauLevenshtein_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

from typing import Callable, Hashable, Sequence
from rapidfuzz._utils import is_none


def _damerau_levenshtein_distance_zhao(
Expand Down Expand Up @@ -174,6 +175,9 @@ def normalized_distance(
norm_dist : float
normalized distance between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 1.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down Expand Up @@ -215,6 +219,9 @@ def normalized_similarity(
norm_sim : float
normalized similarity between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
7 changes: 7 additions & 0 deletions src/rapidfuzz/distance/Hamming_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Callable, Hashable, Sequence

from rapidfuzz.distance._initialize import Editops, Opcodes
from rapidfuzz._utils import is_none


def distance(
Expand Down Expand Up @@ -144,6 +145,9 @@ def normalized_distance(
ValueError
If s1 and s2 have a different length
"""
if is_none(s1) or is_none(s2):
return 1.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down Expand Up @@ -191,6 +195,9 @@ def normalized_similarity(
ValueError
If s1 and s2 have a different length
"""
if is_none(s1) or is_none(s2):
return 0.0

norm_dist = normalized_distance(s1, s2, processor=processor)
norm_sim = 1 - norm_dist

Expand Down
7 changes: 7 additions & 0 deletions src/rapidfuzz/distance/Indel_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from rapidfuzz.distance._initialize import Editops, Opcodes
from rapidfuzz.distance.LCSseq_py import _block_similarity as lcs_seq_block_similarity
from rapidfuzz.distance.LCSseq_py import similarity as lcs_seq_similarity
from rapidfuzz._utils import is_none


def distance(
Expand Down Expand Up @@ -152,6 +153,9 @@ def normalized_distance(
norm_dist : float
normalized distance between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 1.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down Expand Up @@ -224,6 +228,9 @@ def normalized_similarity(
>>> Indel.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
0.8571428571428572
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
7 changes: 7 additions & 0 deletions src/rapidfuzz/distance/JaroWinkler_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Callable, Hashable, Sequence

from rapidfuzz.distance import Jaro
from rapidfuzz._utils import is_none


def similarity(
Expand Down Expand Up @@ -46,6 +47,9 @@ def similarity(
ValueError
If prefix_weight is invalid
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down Expand Up @@ -167,6 +171,9 @@ def distance(
ValueError
If prefix_weight is invalid
"""
if is_none(s1) or is_none(s2):
return 1.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
7 changes: 7 additions & 0 deletions src/rapidfuzz/distance/Jaro_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

from typing import Callable, Hashable, Sequence
from rapidfuzz._utils import is_none


def _jaro_calculate_similarity(
Expand Down Expand Up @@ -95,6 +96,9 @@ def similarity(
similarity : float
similarity between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down Expand Up @@ -209,6 +213,9 @@ def distance(
distance : float
distance between s1 and s2 as a float between 1.0 and 0.0
"""
if is_none(s1) or is_none(s2):
return 1.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
7 changes: 7 additions & 0 deletions src/rapidfuzz/distance/LCSseq_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Callable, Hashable, Sequence

from rapidfuzz.distance._initialize import Editops, Opcodes
from rapidfuzz._utils import is_none


def similarity(
Expand Down Expand Up @@ -173,6 +174,9 @@ def normalized_distance(
norm_dist : float
normalized distance between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 1.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down Expand Up @@ -235,6 +239,9 @@ def normalized_similarity(
>>> LCSseq.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
0.81818181818181
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
7 changes: 7 additions & 0 deletions src/rapidfuzz/distance/Levenshtein_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from rapidfuzz.distance import Indel
from rapidfuzz.distance._initialize import Editops, Opcodes
from rapidfuzz._utils import is_none


def _levenshtein_maximum(
Expand Down Expand Up @@ -268,6 +269,9 @@ def normalized_distance(
ValueError
If unsupported weights are provided a ValueError is thrown
"""
if is_none(s1) or is_none(s2):
return 1.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down Expand Up @@ -350,6 +354,9 @@ def normalized_similarity(
>>> Levenshtein.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
0.81818181818181
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
8 changes: 8 additions & 0 deletions src/rapidfuzz/distance/OSA_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from typing import Callable, Hashable, Sequence

from rapidfuzz._utils import is_none


def _osa_distance_hyrroe2003(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
if not s1:
Expand Down Expand Up @@ -170,6 +172,9 @@ def normalized_distance(
norm_dist : float
normalized distance between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 1.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down Expand Up @@ -211,6 +216,9 @@ def normalized_similarity(
norm_sim : float
normalized similarity between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
8 changes: 8 additions & 0 deletions src/rapidfuzz/distance/Postfix_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from typing import Callable, Hashable, Sequence

from rapidfuzz._utils import is_none


def distance(
s1: Sequence[Hashable],
Expand Down Expand Up @@ -123,6 +125,9 @@ def normalized_distance(
norm_dist : float
normalized distance between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 1.0

norm_sim = normalized_similarity(s1, s2, processor=processor)
norm_dist = 1.0 - norm_sim

Expand Down Expand Up @@ -160,6 +165,9 @@ def normalized_similarity(
norm_sim : float
normalized similarity between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
7 changes: 7 additions & 0 deletions src/rapidfuzz/distance/Prefix_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

from typing import Callable, Hashable, Sequence
from rapidfuzz._utils import is_none


def distance(
Expand Down Expand Up @@ -123,6 +124,9 @@ def normalized_distance(
norm_dist : float
normalized distance between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 1.0

norm_sim = normalized_similarity(s1, s2, processor=processor)
norm_dist = 1.0 - norm_sim

Expand Down Expand Up @@ -160,6 +164,9 @@ def normalized_similarity(
norm_sim : float
normalized similarity between s1 and s2 as a float between 0 and 1.0
"""
if is_none(s1) or is_none(s2):
return 0.0

if processor is not None:
s1 = processor(s1)
s2 = processor(s2)
Expand Down
Loading

0 comments on commit 3f83cba

Please sign in to comment.