From 3f83cba3cef63e2263b047d378923402202080b9 Mon Sep 17 00:00:00 2001
From: Max Bachmann
Date: Sun, 4 Dec 2022 23:30:11 +0100
Subject: [PATCH] improve handling of None/nan

---
 CHANGELOG.md                               |   8 +
 src/rapidfuzz/_utils.py                    |  18 ++-
 .../distance/DamerauLevenshtein_py.py      |   7 +
 src/rapidfuzz/distance/Hamming_py.py       |   7 +
 src/rapidfuzz/distance/Indel_py.py         |   7 +
 src/rapidfuzz/distance/JaroWinkler_py.py   |   7 +
 src/rapidfuzz/distance/Jaro_py.py          |   7 +
 src/rapidfuzz/distance/LCSseq_py.py        |   7 +
 src/rapidfuzz/distance/Levenshtein_py.py   |   7 +
 src/rapidfuzz/distance/OSA_py.py           |   8 +
 src/rapidfuzz/distance/Postfix_py.py       |   8 +
 src/rapidfuzz/distance/Prefix_py.py        |   7 +
 src/rapidfuzz/distance/metrics_cpp.pyx     | 129 +++++++---------
 src/rapidfuzz/fuzz_cpp.pyx                 |  32 ++--
 src/rapidfuzz/fuzz_py.py                   |  33 ++--
 src/rapidfuzz/process_cpp_impl.pyx         |  65 ++++----
 src/rapidfuzz/process_py.py                |  26 +++-
 tests/common.py                            |  63 +++++---
 tests/distance/common.py                   | 146 ++++++++++++++++++
 tests/distance/test_DamerauLevenshtein.py  |  12 +-
 tests/distance/test_Hamming.py             |  10 +-
 tests/distance/test_Indel.py               |  10 +-
 tests/distance/test_Jaro.py                |  11 +-
 tests/distance/test_JaroWinkler.py         |  11 +-
 tests/distance/test_LCSseq.py              |  10 +-
 tests/distance/test_Levenshtein.py         |  21 +--
 tests/distance/test_OSA.py                 |   9 +-
 tests/distance/test_Postfix.py             |  10 +-
 tests/distance/test_Prefix.py              |  10 +-
 tests/distance/test_distance.py            |  20 +++
 tests/test_fuzz.py                         |   9 ++
 tests/test_process.py                      |   3 +
 32 files changed, 483 insertions(+), 255 deletions(-)
 create mode 100644 tests/distance/common.py
 create mode 100644 tests/distance/test_distance.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d520c432..7ab23bc7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 ## Changelog
 
+### [2.14.0] -
+#### Changed
+- handle `float("nan")` similarly to `None` for query / choice, since this is commonly
+  used for missing data in tools like numpy
+
+#### Fixed
+- fix handling of `None`/`float("nan")` in `process.distance`
+
 ### [2.13.3] - 2022-12-03
 #### Fixed
 - improve handling of functions wrapped using `functools.wraps`
diff --git a/src/rapidfuzz/_utils.py b/src/rapidfuzz/_utils.py
index d4390a52..b5c9f781 100644
--- a/src/rapidfuzz/_utils.py
+++ b/src/rapidfuzz/_utils.py
@@ -5,6 +5,7 @@
 
 from enum import Flag
 from typing import Any, Callable
+from math import isnan
 
 
 class ScorerFlag(Flag):
@@ -29,6 +30,16 @@ def _get_scorer_flags_similarity(**_kwargs: Any) -> dict[str, Any]:
     }
 
 
+def is_none(s: Any) -> bool:
+    if s is None:
+        return True
+
+    if isinstance(s, float) and isnan(s):
+        return True
+
+    return False
+
+
 def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]:
     return {"optimal_score": 0, "worst_score": 1, "flags": ScorerFlag.RESULT_F64}
 
@@ -36,12 +47,16 @@ def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]:
     return {"optimal_score": 1, "worst_score": 0, "flags": ScorerFlag.RESULT_F64}
 
-def _create_scorer(func: Any, cached_scorer_call: dict[str, Callable[..., dict[str, Any]]]):
+
+def _create_scorer(
+    func: Any, cached_scorer_call: dict[str, Callable[..., dict[str, Any]]]
+):
     func._RF_ScorerPy = cached_scorer_call
     # used to detect the function hasn't been wrapped afterwards
     func._RF_OriginalScorer = func
     return func
 
+
 def fallback_import(
     module: str,
     name: str,
@@ -93,6 +108,7 @@ def fallback_import(
 
     return cpp_func
 
+
 default_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {
     "get_scorer_flags": _get_scorer_flags_distance
 }
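
The `is_none` helper above is what lets every scorer treat `float("nan")` the same as `None`. A minimal sketch of the resulting behavior, mirroring the assertions added in tests/distance/test_distance.py and tests/test_fuzz.py further down; it assumes a rapidfuzz build that already contains this patch:

    from rapidfuzz import fuzz
    from rapidfuzz.distance import Levenshtein

    # None and float("nan") are treated as missing data and yield the worst score
    assert Levenshtein.normalized_distance(None, "test") == 1.0
    assert Levenshtein.normalized_similarity(float("nan"), "test") == 0.0
    # the fuzz scorers report 0 for missing input
    assert fuzz.ratio(float("nan"), "test") == 0
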
diff --git a/src/rapidfuzz/distance/DamerauLevenshtein_py.py b/src/rapidfuzz/distance/DamerauLevenshtein_py.py index 737490e2..fdf77145 100644 --- a/src/rapidfuzz/distance/DamerauLevenshtein_py.py +++ b/src/rapidfuzz/distance/DamerauLevenshtein_py.py @@ -4,6 +4,7 @@ from __future__ import annotations from typing import Callable, Hashable, Sequence +from rapidfuzz._utils import is_none def _damerau_levenshtein_distance_zhao( @@ -174,6 +175,9 @@ def normalized_distance( norm_dist : float normalized distance between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 1.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -215,6 +219,9 @@ def normalized_similarity( norm_sim : float normalized similarity between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git a/src/rapidfuzz/distance/Hamming_py.py b/src/rapidfuzz/distance/Hamming_py.py index e4b30f31..553b3e55 100644 --- a/src/rapidfuzz/distance/Hamming_py.py +++ b/src/rapidfuzz/distance/Hamming_py.py @@ -6,6 +6,7 @@ from typing import Callable, Hashable, Sequence from rapidfuzz.distance._initialize import Editops, Opcodes +from rapidfuzz._utils import is_none def distance( @@ -144,6 +145,9 @@ def normalized_distance( ValueError If s1 and s2 have a different length """ + if is_none(s1) or is_none(s2): + return 1.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -191,6 +195,9 @@ def normalized_similarity( ValueError If s1 and s2 have a different length """ + if is_none(s1) or is_none(s2): + return 0.0 + norm_dist = normalized_distance(s1, s2, processor=processor) norm_sim = 1 - norm_dist diff --git a/src/rapidfuzz/distance/Indel_py.py b/src/rapidfuzz/distance/Indel_py.py index a4e1c5a6..05f09eba 100644 --- a/src/rapidfuzz/distance/Indel_py.py +++ b/src/rapidfuzz/distance/Indel_py.py @@ -8,6 +8,7 @@ from rapidfuzz.distance._initialize import Editops, Opcodes from rapidfuzz.distance.LCSseq_py import _block_similarity as lcs_seq_block_similarity from rapidfuzz.distance.LCSseq_py import similarity as lcs_seq_similarity +from rapidfuzz._utils import is_none def distance( @@ -152,6 +153,9 @@ def normalized_distance( norm_dist : float normalized distance between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 1.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -224,6 +228,9 @@ def normalized_similarity( >>> Indel.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0]) 0.8571428571428572 """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git a/src/rapidfuzz/distance/JaroWinkler_py.py b/src/rapidfuzz/distance/JaroWinkler_py.py index e9eef340..262e1964 100644 --- a/src/rapidfuzz/distance/JaroWinkler_py.py +++ b/src/rapidfuzz/distance/JaroWinkler_py.py @@ -6,6 +6,7 @@ from typing import Callable, Hashable, Sequence from rapidfuzz.distance import Jaro +from rapidfuzz._utils import is_none def similarity( @@ -46,6 +47,9 @@ def similarity( ValueError If prefix_weight is invalid """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -167,6 +171,9 @@ def distance( ValueError If prefix_weight is invalid """ + if is_none(s1) or is_none(s2): + return 1.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git 
a/src/rapidfuzz/distance/Jaro_py.py b/src/rapidfuzz/distance/Jaro_py.py index 780aebfd..07035af8 100644 --- a/src/rapidfuzz/distance/Jaro_py.py +++ b/src/rapidfuzz/distance/Jaro_py.py @@ -4,6 +4,7 @@ from __future__ import annotations from typing import Callable, Hashable, Sequence +from rapidfuzz._utils import is_none def _jaro_calculate_similarity( @@ -95,6 +96,9 @@ def similarity( similarity : float similarity between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -209,6 +213,9 @@ def distance( distance : float distance between s1 and s2 as a float between 1.0 and 0.0 """ + if is_none(s1) or is_none(s2): + return 1.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git a/src/rapidfuzz/distance/LCSseq_py.py b/src/rapidfuzz/distance/LCSseq_py.py index 1c28b5ab..72a06fcd 100644 --- a/src/rapidfuzz/distance/LCSseq_py.py +++ b/src/rapidfuzz/distance/LCSseq_py.py @@ -6,6 +6,7 @@ from typing import Callable, Hashable, Sequence from rapidfuzz.distance._initialize import Editops, Opcodes +from rapidfuzz._utils import is_none def similarity( @@ -173,6 +174,9 @@ def normalized_distance( norm_dist : float normalized distance between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 1.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -235,6 +239,9 @@ def normalized_similarity( >>> LCSseq.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0]) 0.81818181818181 """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git a/src/rapidfuzz/distance/Levenshtein_py.py b/src/rapidfuzz/distance/Levenshtein_py.py index abdbee19..d9bf1c60 100644 --- a/src/rapidfuzz/distance/Levenshtein_py.py +++ b/src/rapidfuzz/distance/Levenshtein_py.py @@ -7,6 +7,7 @@ from rapidfuzz.distance import Indel from rapidfuzz.distance._initialize import Editops, Opcodes +from rapidfuzz._utils import is_none def _levenshtein_maximum( @@ -268,6 +269,9 @@ def normalized_distance( ValueError If unsupported weights are provided a ValueError is thrown """ + if is_none(s1) or is_none(s2): + return 1.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -350,6 +354,9 @@ def normalized_similarity( >>> Levenshtein.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0]) 0.81818181818181 """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git a/src/rapidfuzz/distance/OSA_py.py b/src/rapidfuzz/distance/OSA_py.py index 18d9f9dd..aeea716f 100644 --- a/src/rapidfuzz/distance/OSA_py.py +++ b/src/rapidfuzz/distance/OSA_py.py @@ -5,6 +5,8 @@ from typing import Callable, Hashable, Sequence +from rapidfuzz._utils import is_none + def _osa_distance_hyrroe2003(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int: if not s1: @@ -170,6 +172,9 @@ def normalized_distance( norm_dist : float normalized distance between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 1.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -211,6 +216,9 @@ def normalized_similarity( norm_sim : float normalized similarity between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git a/src/rapidfuzz/distance/Postfix_py.py 
b/src/rapidfuzz/distance/Postfix_py.py index 6483d1e5..5307bdaa 100644 --- a/src/rapidfuzz/distance/Postfix_py.py +++ b/src/rapidfuzz/distance/Postfix_py.py @@ -5,6 +5,8 @@ from typing import Callable, Hashable, Sequence +from rapidfuzz._utils import is_none + def distance( s1: Sequence[Hashable], @@ -123,6 +125,9 @@ def normalized_distance( norm_dist : float normalized distance between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 1.0 + norm_sim = normalized_similarity(s1, s2, processor=processor) norm_dist = 1.0 - norm_sim @@ -160,6 +165,9 @@ def normalized_similarity( norm_sim : float normalized similarity between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git a/src/rapidfuzz/distance/Prefix_py.py b/src/rapidfuzz/distance/Prefix_py.py index 29bbf62e..edbfd4fc 100644 --- a/src/rapidfuzz/distance/Prefix_py.py +++ b/src/rapidfuzz/distance/Prefix_py.py @@ -4,6 +4,7 @@ from __future__ import annotations from typing import Callable, Hashable, Sequence +from rapidfuzz._utils import is_none def distance( @@ -123,6 +124,9 @@ def normalized_distance( norm_dist : float normalized distance between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 1.0 + norm_sim = normalized_similarity(s1, s2, processor=processor) norm_dist = 1.0 - norm_sim @@ -160,6 +164,9 @@ def normalized_similarity( norm_sim : float normalized similarity between s1 and s2 as a float between 0 and 1.0 """ + if is_none(s1) or is_none(s2): + return 0.0 + if processor is not None: s1 = processor(s1) s2 = processor(s2) diff --git a/src/rapidfuzz/distance/metrics_cpp.pyx b/src/rapidfuzz/distance/metrics_cpp.pyx index 97c80558..e61502ee 100644 --- a/src/rapidfuzz/distance/metrics_cpp.pyx +++ b/src/rapidfuzz/distance/metrics_cpp.pyx @@ -32,6 +32,7 @@ from cpp_common cimport ( from cpython.pycapsule cimport PyCapsule_New from libc.stdint cimport INT64_MAX, int64_t from libc.stdlib cimport free, malloc +from libc.math cimport isnan from libcpp cimport bool @@ -168,6 +169,14 @@ cdef extern from "metrics.hpp": bool PostfixSimilarityInit( RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) nogil except False bool PostfixNormalizedSimilarityInit(RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) nogil except False +cdef inline bool is_none(s): + if s is None: + return True + + if isinstance(s, float) and isnan(s): + return True + + return False cdef int64_t get_score_cutoff_i64(score_cutoff, int64_t default) except -1: cdef int64_t c_score_cutoff = default @@ -231,8 +240,8 @@ def levenshtein_similarity(s1, s2, *, weights=(1,1,1), processor=None, score_cut def levenshtein_normalized_distance(s1, s2, *, weights=(1,1,1), processor=None, score_cutoff=None, score_hint=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef int64_t insertion, deletion, substitution insertion = deletion = substitution = 1 @@ -247,8 +256,8 @@ def levenshtein_normalized_distance(s1, s2, *, weights=(1,1,1), processor=None, def levenshtein_normalized_similarity(s1, s2, *, weights=(1,1,1), processor=None, score_cutoff=None, score_hint=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef int64_t insertion, deletion, substitution insertion = deletion = substitution = 1 @@ -375,8 +384,8 @@ def 
damerau_levenshtein_similarity(s1, s2, *, processor=None, score_cutoff=None) def damerau_levenshtein_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -385,8 +394,8 @@ def damerau_levenshtein_normalized_distance(s1, s2, *, processor=None, score_cut def damerau_levenshtein_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -445,8 +454,8 @@ def lcs_seq_similarity(s1, s2, *, processor=None, score_cutoff=None): def lcs_seq_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -455,8 +464,8 @@ def lcs_seq_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): def lcs_seq_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -546,8 +555,8 @@ def indel_similarity(s1, s2, *, processor=None, score_cutoff=None): def indel_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -556,8 +565,8 @@ def indel_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): def indel_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -638,9 +647,6 @@ def hamming_distance(s1, s2, *, processor=None, score_cutoff=None): cdef int64_t c_score_cutoff = get_score_cutoff_i64(score_cutoff, INT64_MAX) cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) return hamming_distance_func(s1_proc.string, s2_proc.string, c_score_cutoff) @@ -648,16 +654,13 @@ def hamming_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef int64_t c_score_cutoff = get_score_cutoff_i64(score_cutoff, 0) cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) return hamming_similarity_func(s1_proc.string, s2_proc.string, c_score_cutoff) def hamming_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if 
is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -666,8 +669,8 @@ def hamming_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): def hamming_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -732,9 +735,6 @@ def osa_distance(s1, s2, *, processor=None, score_cutoff=None): cdef int64_t c_score_cutoff = get_score_cutoff_i64(score_cutoff, INT64_MAX) cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) return osa_distance_func(s1_proc.string, s2_proc.string, c_score_cutoff) @@ -742,16 +742,13 @@ def osa_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef int64_t c_score_cutoff = get_score_cutoff_i64(score_cutoff, 0) cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) return osa_similarity_func(s1_proc.string, s2_proc.string, c_score_cutoff) def osa_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -760,8 +757,8 @@ def osa_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): def osa_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -822,8 +819,8 @@ osa_normalized_similarity._RF_Scorer = PyCapsule_New(&OSANormalizedSimilarityCon def jaro_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -831,8 +828,8 @@ def jaro_distance(s1, s2, *, processor=None, score_cutoff=None): def jaro_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -840,8 +837,8 @@ def jaro_similarity(s1, s2, *, processor=None, score_cutoff=None): def jaro_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -850,8 +847,8 @@ def jaro_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): def jaro_normalized_similarity(s1, s2, *, processor=None, 
score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -884,8 +881,8 @@ jaro_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroSimilarityContext, NU def jaro_winkler_distance(s1, s2, *, double prefix_weight=0.1, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -893,8 +890,8 @@ def jaro_winkler_distance(s1, s2, *, double prefix_weight=0.1, processor=None, s def jaro_winkler_similarity(s1, s2, *, double prefix_weight=0.1, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -902,8 +899,8 @@ def jaro_winkler_similarity(s1, s2, *, double prefix_weight=0.1, processor=None, def jaro_winkler_normalized_distance(s1, s2, *, double prefix_weight=0.1, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -911,8 +908,8 @@ def jaro_winkler_normalized_distance(s1, s2, *, double prefix_weight=0.1, proces def jaro_winkler_normalized_similarity(s1, s2, *, double prefix_weight=0.1, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -957,9 +954,6 @@ def postfix_distance(s1, s2, *, processor=None, score_cutoff=None): cdef int64_t c_score_cutoff = get_score_cutoff_i64(score_cutoff, INT64_MAX) cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) return postfix_distance_func(s1_proc.string, s2_proc.string, c_score_cutoff) @@ -967,16 +961,13 @@ def postfix_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef int64_t c_score_cutoff = get_score_cutoff_i64(score_cutoff, 0) cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) return postfix_similarity_func(s1_proc.string, s2_proc.string, c_score_cutoff) def postfix_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -985,8 +976,8 @@ def postfix_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): def postfix_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + 
return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -1037,9 +1028,6 @@ def prefix_distance(s1, s2, *, processor=None, score_cutoff=None): cdef int64_t c_score_cutoff = get_score_cutoff_i64(score_cutoff, INT64_MAX) cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) return prefix_distance_func(s1_proc.string, s2_proc.string, c_score_cutoff) @@ -1047,16 +1035,13 @@ def prefix_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef int64_t c_score_cutoff = get_score_cutoff_i64(score_cutoff, 0) cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) return prefix_similarity_func(s1_proc.string, s2_proc.string, c_score_cutoff) def prefix_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 1.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) @@ -1065,8 +1050,8 @@ def prefix_normalized_distance(s1, s2, *, processor=None, score_cutoff=None): def prefix_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: - return 0 + if is_none(s1) or is_none(s2): + return 0.0 cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0) preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, None) diff --git a/src/rapidfuzz/fuzz_cpp.pyx b/src/rapidfuzz/fuzz_cpp.pyx index 7c00a346..f51f7aef 100644 --- a/src/rapidfuzz/fuzz_cpp.pyx +++ b/src/rapidfuzz/fuzz_cpp.pyx @@ -31,6 +31,7 @@ from cpp_common cimport ( preprocess_strings, ) from libc.stdint cimport int64_t, uint32_t +from libc.math cimport isnan from libcpp cimport bool from array import array @@ -63,11 +64,20 @@ cdef extern from "fuzz_cpp.hpp": bool RatioMultiStringSupport(const RF_Kwargs*) nogil +cdef inline bool is_none(s): + if s is None: + return True + + if isinstance(s, float) and isnan(s): + return True + + return False + def ratio(s1, s2, *, processor=None, score_cutoff=None): cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, default_process) @@ -78,7 +88,7 @@ def partial_ratio(s1, s2, *, processor=None, score_cutoff=None): cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, default_process) @@ -89,7 +99,7 @@ def partial_ratio_alignment(s1, s2, *, processor=None, score_cutoff=None): cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return None preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, default_process) @@ -105,7 +115,7 @@ def token_sort_ratio(s1, s2, *, processor=default_process, score_cutoff=None): cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is 
None: + if is_none(s1) or is_none(s2): return 0 preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, default_process) @@ -116,7 +126,7 @@ def token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=None): cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, default_process) @@ -127,7 +137,7 @@ def token_ratio(s1, s2, *, processor=default_process, score_cutoff=None): cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, default_process) @@ -138,7 +148,7 @@ def partial_token_sort_ratio(s1, s2, *, processor=default_process, score_cutoff= cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, default_process) @@ -149,7 +159,7 @@ def partial_token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=N cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -163,7 +173,7 @@ def partial_token_ratio(s1, s2, *, processor=default_process, score_cutoff=None) cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc, default_process) @@ -174,7 +184,7 @@ def WRatio(s1, s2, *, processor=default_process, score_cutoff=None): cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -187,7 +197,7 @@ def QRatio(s1, s2, *, processor=default_process, score_cutoff=None): cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff cdef RF_StringWrapper s1_proc, s2_proc - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: diff --git a/src/rapidfuzz/fuzz_py.py b/src/rapidfuzz/fuzz_py.py index 7df641eb..7dd7ac5e 100644 --- a/src/rapidfuzz/fuzz_py.py +++ b/src/rapidfuzz/fuzz_py.py @@ -2,8 +2,8 @@ # Copyright (C) 2022 Max Bachmann from __future__ import annotations -from math import ceil -from typing import Callable, Hashable +from math import ceil, isnan +from typing import Callable, Hashable, Any from rapidfuzz.distance import ScoreAlignment from rapidfuzz.distance.Indel_py import ( @@ -14,6 +14,7 @@ normalized_similarity as indel_normalized_similarity, ) from rapidfuzz.utils_py import default_process +from rapidfuzz._utils import is_none def _norm_distance(dist: int, lensum: int, score_cutoff: float) -> float: @@ -63,7 +64,7 @@ def ratio( >>> fuzz.ratio("this is a test", "this is a test!") 96.55171966552734 """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -226,10 +227,12 @@ def partial_ratio( >>> fuzz.partial_ratio("this is a test", "this is a test!") 100.0 """ - alignment = partial_ratio_alignment(s1, s2, processor=processor, 
score_cutoff=score_cutoff) + alignment = partial_ratio_alignment( + s1, s2, processor=processor, score_cutoff=score_cutoff + ) if alignment is None: return 0 - + return alignment.score @@ -277,7 +280,7 @@ def partial_ratio_alignment( >>> fuzz.ratio(s1[res.src_start:res.src_end], s2[res.dest_start:res.dest_end]) 83.33333333333334 """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return None if processor is True: @@ -310,7 +313,7 @@ def partial_ratio_alignment( res = ScoreAlignment( res2.score, res2.dest_start, res2.dest_end, res2.src_start, res2.src_end ) - + if res.score < score_cutoff: return None @@ -360,7 +363,7 @@ def token_sort_ratio( >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear") 100.0 """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -418,7 +421,7 @@ def token_set_ratio( >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") 100.0 """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -517,7 +520,7 @@ def token_ratio( ----- .. image:: img/token_ratio.svg """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -569,7 +572,7 @@ def partial_token_sort_ratio( ----- .. image:: img/partial_token_sort_ratio.svg """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -620,7 +623,7 @@ def partial_token_set_ratio( ----- .. image:: img/partial_token_set_ratio.svg """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -682,7 +685,7 @@ def partial_token_ratio( ----- .. image:: img/partial_token_ratio.svg """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: @@ -763,7 +766,7 @@ def WRatio( ----- .. 
image:: img/WRatio.svg """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 UNBASE_SCALE = 0.95 @@ -849,7 +852,7 @@ def QRatio( >>> fuzz.QRatio("this is a test", "THIS is a test!") 100.0 """ - if s1 is None or s2 is None: + if is_none(s1) or is_none(s2): return 0 if processor is True: diff --git a/src/rapidfuzz/process_cpp_impl.pyx b/src/rapidfuzz/process_cpp_impl.pyx index eaee5cf7..587ede8f 100644 --- a/src/rapidfuzz/process_cpp_impl.pyx +++ b/src/rapidfuzz/process_cpp_impl.pyx @@ -19,7 +19,7 @@ from cpython.exc cimport PyErr_CheckSignals from cpython.list cimport PyList_New, PyList_SET_ITEM from cpython.object cimport PyObject from cpython.ref cimport Py_INCREF -from libc.math cimport floor +from libc.math cimport floor, isnan from libc.stdint cimport int32_t, int64_t, uint8_t, uint64_t from libcpp cimport algorithm, bool from libcpp.utility cimport move @@ -128,6 +128,15 @@ cdef extern from "process_cpp.hpp": RfMatrix cdist_two_lists_impl[T]( const RF_ScorerFlags* scorer_flags, const RF_Kwargs*, RF_Scorer*, const vector[RF_StringWrapper]&, const vector[RF_StringWrapper]&, MatrixType, int, T, T) except + +cdef inline bool is_none(s): + if s is None: + return True + + if isinstance(s, float) and isnan(s): + return True + + return False + cdef inline vector[DictStringElem] preprocess_dict(queries, processor) except *: cdef vector[DictStringElem] proc_queries cdef int64_t queries_len = len(queries) @@ -139,7 +148,7 @@ cdef inline vector[DictStringElem] preprocess_dict(queries, processor) except *: # No processor if not processor: for i, (query_key, query) in enumerate(queries.items()): - if query is None: + if is_none(query): continue proc_queries.emplace_back( i, @@ -155,7 +164,7 @@ cdef inline vector[DictStringElem] preprocess_dict(queries, processor) except *: # use RapidFuzz C-Api if processor_context != NULL and processor_context.version == SCORER_STRUCT_VERSION: for i, (query_key, query) in enumerate(queries.items()): - if query is None: + if is_none(query): continue processor_context.preprocess(query, &proc_str) proc_queries.emplace_back( @@ -168,7 +177,7 @@ cdef inline vector[DictStringElem] preprocess_dict(queries, processor) except *: # Call Processor through Python else: for i, (query_key, query) in enumerate(queries.items()): - if query is None: + if is_none(query): continue proc_query = processor(query) proc_queries.emplace_back( @@ -191,7 +200,7 @@ cdef inline vector[ListStringElem] preprocess_list(queries, processor) except *: # No processor if not processor: for i, query in enumerate(queries): - if query is None: + if is_none(query): continue proc_queries.emplace_back( i, @@ -206,7 +215,7 @@ cdef inline vector[ListStringElem] preprocess_list(queries, processor) except *: # use RapidFuzz C-Api if processor_context != NULL and processor_context.version == SCORER_STRUCT_VERSION: for i, query in enumerate(queries): - if query is None: + if is_none(query): continue processor_context.preprocess(query, &proc_str) proc_queries.emplace_back( @@ -218,7 +227,7 @@ cdef inline vector[ListStringElem] preprocess_list(queries, processor) except *: # Call Processor through Python else: for i, query in enumerate(queries): - if query is None: + if is_none(query): continue proc_query = processor(query) proc_queries.emplace_back( @@ -265,7 +274,7 @@ cdef inline extractOne_dict_f64( if i % 1000 == 0: PyErr_CheckSignals() i += 1 - if choice is None: + if is_none(choice): continue if processor is None: @@ -336,7 +345,7 @@ cdef inline extractOne_dict_i64( if i % 1000 == 0: 
PyErr_CheckSignals() i += 1 - if choice is None: + if is_none(choice): continue if processor is None: @@ -428,7 +437,7 @@ cdef inline extractOne_list_f64( for i, choice in enumerate(choices): if i % 1000 == 0: PyErr_CheckSignals() - if choice is None: + if is_none(choice): continue if processor is None: @@ -497,7 +506,7 @@ cdef inline extractOne_list_i64( for i, choice in enumerate(choices): if i % 1000 == 0: PyErr_CheckSignals() - if choice is None: + if is_none(choice): continue if processor is None: @@ -567,7 +576,7 @@ cdef inline py_extractOne_dict(query, choices, scorer, processor, double score_c result_key = None for choice_key, choice in choices.items(): - if choice is None: + if is_none(choice): continue if processor is not None: @@ -607,7 +616,7 @@ cdef inline py_extractOne_list(query, choices, scorer, processor, double score_c result_choice = None for i, choice in enumerate(choices): - if choice is None: + if is_none(choice): continue if processor is not None: @@ -642,7 +651,7 @@ def extractOne(query, choices, *, scorer=WRatio, processor=default_process, scor cdef RF_Scorer* scorer_context = NULL cdef RF_ScorerFlags scorer_flags - if query is None: + if is_none(query): return None if processor is True: @@ -897,7 +906,7 @@ cdef inline py_extract_dict(query, choices, scorer, processor, int64_t limit, do cdef list result_list = [] for choice_key, choice in choices.items(): - if choice is None: + if is_none(choice): continue if processor is not None: @@ -925,7 +934,7 @@ cdef inline py_extract_list(query, choices, scorer, processor, int64_t limit, do cdef int64_t i for i, choice in enumerate(choices): - if choice is None: + if is_none(choice): continue if processor is not None: @@ -950,7 +959,7 @@ def extract(query, choices, *, scorer=WRatio, processor=default_process, limit=5 cdef RF_Scorer* scorer_context = NULL cdef RF_ScorerFlags scorer_flags - if query is None: + if is_none(query): return [] if processor is True: @@ -1021,7 +1030,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc cdef double score for choice_key, choice in choices.items(): - if choice is None: + if is_none(choice): continue # use RapidFuzz C-Api @@ -1030,7 +1039,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc choice_proc = RF_StringWrapper(proc_str) elif processor is not None: proc_choice = processor(choice) - if proc_choice is None: + if is_none(proc_choice): continue choice_proc = RF_StringWrapper(conv_sequence(proc_choice)) @@ -1064,7 +1073,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc cdef int64_t score for choice_key, choice in choices.items(): - if choice is None: + if is_none(choice): continue # use RapidFuzz C-Api @@ -1073,7 +1082,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc choice_proc = RF_StringWrapper(proc_str) elif processor is not None: proc_choice = processor(choice) - if proc_choice is None: + if is_none(proc_choice): continue choice_proc = RF_StringWrapper(conv_sequence(proc_choice)) @@ -1107,7 +1116,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc cdef double score for i, choice in enumerate(choices): - if choice is None: + if is_none(choice): continue # use RapidFuzz C-Api @@ -1116,7 +1125,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc choice_proc = RF_StringWrapper(proc_str) elif processor is not None: proc_choice = processor(choice) - if proc_choice is None: + if 
is_none(proc_choice): continue choice_proc = RF_StringWrapper(conv_sequence(proc_choice)) @@ -1150,7 +1159,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc cdef int64_t score for i, choice in enumerate(choices): - if choice is None: + if is_none(choice): continue # use RapidFuzz C-Api @@ -1159,7 +1168,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc choice_proc = RF_StringWrapper(proc_str) elif processor is not None: proc_choice = processor(choice) - if proc_choice is None: + if is_none(proc_choice): continue choice_proc = RF_StringWrapper(conv_sequence(proc_choice)) @@ -1183,7 +1192,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc cdef bool lowest_score_worst = optimal_score > worst_score for choice_key, choice in choices.items(): - if choice is None: + if is_none(choice): continue if processor is not None: @@ -1208,7 +1217,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc cdef int64_t i for i, choice in enumerate(choices): - if choice is None: + if is_none(choice): continue if processor is not None: @@ -1223,7 +1232,7 @@ def extract_iter(query, choices, *, scorer=WRatio, processor=default_process, sc if score <= score_cutoff: yield (choice, score, i) - if query is None: + if is_none(query): # finish generator return diff --git a/src/rapidfuzz/process_py.py b/src/rapidfuzz/process_py.py index 1b813de6..79c1a743 100644 --- a/src/rapidfuzz/process_py.py +++ b/src/rapidfuzz/process_py.py @@ -19,6 +19,7 @@ from rapidfuzz._utils import ScorerFlag from rapidfuzz.fuzz import WRatio, ratio from rapidfuzz.utils import default_process +from math import isnan __all__ = ["extract", "extract_iter", "extractOne", "cdist"] @@ -31,6 +32,16 @@ def _get_scorer_flags_py(scorer: Any, kwargs: dict[str, Any]) -> tuple[int, int] return (0, 100) +def _is_none(s: Any) -> bool: + if s is None: + return True + + if isinstance(s, float) and isnan(s): + return True + + return False + + @overload def extract_iter( query: Sequence[Hashable] | None, @@ -130,7 +141,7 @@ def extract_iter( worst_score, optimal_score = _get_scorer_flags_py(scorer, kwargs) lowest_score_worst = optimal_score > worst_score - if query is None: + if _is_none(query): return if processor is True: @@ -148,7 +159,7 @@ def extract_iter( choices_iter: Iterable[tuple[Any, Sequence[Hashable] | None]] choices_iter = choices.items() if hasattr(choices, "items") else enumerate(choices) # type: ignore[union-attr] for key, choice in choices_iter: - if choice is None: + if _is_none(choice): continue if processor is None: @@ -334,7 +345,7 @@ def extractOne( worst_score, optimal_score = _get_scorer_flags_py(scorer, kwargs) lowest_score_worst = optimal_score > worst_score - if query is None: + if _is_none(query): return None if processor is True: @@ -354,7 +365,7 @@ def extractOne( choices_iter: Iterable[tuple[Any, Sequence[Hashable] | None]] choices_iter = choices.items() if hasattr(choices, "items") else enumerate(choices) # type: ignore[union-attr] for key, choice in choices_iter: - if choice is None: + if _is_none(choice): continue if processor is None: @@ -497,7 +508,12 @@ def extract( limit = len(choices) result_iter = extract_iter( - query, choices, processor=processor, scorer=scorer, score_cutoff=score_cutoff, **kwargs + query, + choices, + processor=processor, + scorer=scorer, + score_cutoff=score_cutoff, + **kwargs, ) if lowest_score_worst: return heapq.nlargest(limit, result_iter, key=lambda i: i[1]) diff 
--git a/tests/common.py b/tests/common.py index c2501d01..16feeafe 100644 --- a/tests/common.py +++ b/tests/common.py @@ -6,13 +6,22 @@ from rapidfuzz import process_cpp, process_py from rapidfuzz import utils +from math import isnan + + +def is_none(s): + if s is None: + return True + + if isinstance(s, float) and isnan(s): + return True + + return False + def scorer_tester(scorer, s1, s2, **kwargs): score1 = scorer(s1, s2, **kwargs) - if s1 is None or s2 is None: - return score1 - if "processor" not in kwargs: kwargs["processor"] = None elif kwargs["processor"] is True: @@ -20,23 +29,37 @@ def scorer_tester(scorer, s1, s2, **kwargs): elif kwargs["processor"] is False: kwargs["processor"] = None + extractOne_res1 = process_cpp.extractOne(s1, [s2], scorer=scorer, **kwargs) + extractOne_res2 = process_py.extractOne(s1, [s2], scorer=scorer, **kwargs) + extract_res1 = process_cpp.extract(s1, [s2], scorer=scorer, **kwargs) + extract_res2 = process_py.extract(s1, [s2], scorer=scorer, **kwargs) + + if is_none(s1) or is_none(s2): + assert extractOne_res1 is None + assert extractOne_res2 is None + assert extract_res1 == [] + assert extract_res2 == [] # todo add testing with score_cutoff # this is a bit harder, since result elements are filtererd out # if they are worse than score_cutoff - if kwargs.get("score_cutoff") is None: - score2 = process_cpp.extractOne(s1, [s2], scorer=scorer, **kwargs)[1] - score3 = process_cpp.extract(s1, [s2], scorer=scorer, **kwargs)[0][1] - score4 = process_py.extractOne(s1, [s2], scorer=scorer, **kwargs)[1] - score5 = process_py.extract(s1, [s2], scorer=scorer, **kwargs)[0][1] - assert pytest.approx(score1) == score2 - assert pytest.approx(score1) == score3 - assert pytest.approx(score1) == score4 - assert pytest.approx(score1) == score5 - - score6 = process_cpp.cdist([s1], [s2], scorer=scorer, **kwargs)[0][0] - score7 = process_py.cdist([s1], [s2], scorer=scorer, **kwargs)[0][0] - assert pytest.approx(score1) == score6 - assert pytest.approx(score1) == score7 + elif kwargs.get("score_cutoff") is not None: + assert extractOne_res1 is None or pytest.approx(score1) == extractOne_res1[1] + assert extractOne_res2 is None or pytest.approx(score1) == extractOne_res2[1] + assert extract_res1 == [] or pytest.approx(score1) == extract_res1[0][1] + assert extract_res2 == [] or pytest.approx(score1) == extract_res2[0][1] + else: + assert pytest.approx(score1) == extractOne_res1[1] + assert pytest.approx(score1) == extractOne_res2[1] + assert pytest.approx(score1) == extract_res1[0][1] + assert pytest.approx(score1) == extract_res2[0][1] + + # todo this should be able to handle None similar to the original scorer + if not is_none(s1) and not is_none(s2): + score6 = process_cpp.cdist([s1], [s2], scorer=scorer, **kwargs)[0][0] + score7 = process_py.cdist([s1], [s2], scorer=scorer, **kwargs)[0][0] + assert pytest.approx(score1) == score6 + assert pytest.approx(score1) == score7 + return score1 @@ -128,9 +151,11 @@ def similarity(self, s1, s2, **kwargs): return self._similarity(s1, s2, **kwargs) def normalized_distance(self, s1, s2, **kwargs): - self._validate(s1, s2, **kwargs) + if not is_none(s1) and not is_none(s2): + self._validate(s1, s2, **kwargs) return self._normalized_distance(s1, s2, **kwargs) def normalized_similarity(self, s1, s2, **kwargs): - self._validate(s1, s2, **kwargs) + if not is_none(s1) and not is_none(s2): + self._validate(s1, s2, **kwargs) return self._normalized_similarity(s1, s2, **kwargs) diff --git a/tests/distance/common.py b/tests/distance/common.py new 
file mode 100644 index 00000000..4bd47e4e --- /dev/null +++ b/tests/distance/common.py @@ -0,0 +1,146 @@ +from rapidfuzz.distance import ( + DamerauLevenshtein_cpp, + DamerauLevenshtein_py, + Hamming_cpp, + Hamming_py, + Indel_cpp, + Indel_py, + Jaro_cpp, + Jaro_py, + JaroWinkler_cpp, + JaroWinkler_py, + LCSseq_cpp, + LCSseq_py, + Levenshtein_cpp, + Levenshtein_py, + OSA_cpp, + OSA_py, + Postfix_cpp, + Postfix_py, + Prefix_cpp, + Prefix_py, +) +from ..common import GenericScorer, is_none + + +def get_scorer_flags_damerau_levenshtein(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": max(len(s1), len(s2)), "symmetric": True} + + +DamerauLevenshtein = GenericScorer( + DamerauLevenshtein_py, DamerauLevenshtein_cpp, get_scorer_flags_damerau_levenshtein +) + + +def get_scorer_flags_hamming(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": max(len(s1), len(s2)), "symmetric": True} + + +Hamming = GenericScorer(Hamming_py, Hamming_cpp, get_scorer_flags_hamming) + + +def get_scorer_flags_indel(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": len(s1) + len(s2), "symmetric": True} + + +Indel = GenericScorer(Indel_py, Indel_cpp, get_scorer_flags_indel) + + +def get_scorer_flags_jaro(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": 1.0, "symmetric": True} + + +Jaro = GenericScorer(Jaro_py, Jaro_cpp, get_scorer_flags_jaro) + + +def get_scorer_flags_jaro_winkler(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": 1.0, "symmetric": True} + + +JaroWinkler = GenericScorer( + JaroWinkler_py, JaroWinkler_cpp, get_scorer_flags_jaro_winkler +) + + +def get_scorer_flags_lcs_seq(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": max(len(s1), len(s2)), "symmetric": True} + + +LCSseq = GenericScorer(LCSseq_py, LCSseq_cpp, get_scorer_flags_lcs_seq) + + +def get_scorer_flags_levenshtein(s1, s2, weights=(1, 1, 1), **kwargs): + insert_cost, delete_cost, replace_cost = weights + + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": insert_cost == delete_cost} + + max_dist = len(s1) * delete_cost + len(s2) * insert_cost + + if len(s1) >= len(s2): + max_dist = min( + max_dist, len(s2) * replace_cost + (len(s1) - len(s2)) * delete_cost + ) + else: + max_dist = min( + max_dist, len(s1) * replace_cost + (len(s2) - len(s1)) * insert_cost + ) + + return {"maximum": max_dist, "symmetric": insert_cost == delete_cost} + + +Levenshtein = GenericScorer( + Levenshtein_py, Levenshtein_cpp, get_scorer_flags_levenshtein +) + + +def get_scorer_flags_osa(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": max(len(s1), len(s2)), "symmetric": True} + + +OSA = GenericScorer(OSA_py, OSA_cpp, get_scorer_flags_osa) + + +def get_scorer_flags_postfix(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": max(len(s1), len(s2)), "symmetric": True} + + +Postfix = GenericScorer(Postfix_py, Postfix_cpp, get_scorer_flags_postfix) + + +def get_scorer_flags_prefix(s1, s2, **kwargs): + if is_none(s1) or is_none(s2): + return {"maximum": None, "symmetric": True} + return {"maximum": max(len(s1), 
len(s2)), "symmetric": True} + + +Prefix = GenericScorer(Prefix_py, Prefix_cpp, get_scorer_flags_prefix) + +all_scorer_modules = [ + DamerauLevenshtein, + Hamming, + Indel, + Jaro, + JaroWinkler, + LCSseq, + Levenshtein, + OSA, + Postfix, + Prefix, +] diff --git a/tests/distance/test_DamerauLevenshtein.py b/tests/distance/test_DamerauLevenshtein.py index cb3f8173..98c870a2 100644 --- a/tests/distance/test_DamerauLevenshtein.py +++ b/tests/distance/test_DamerauLevenshtein.py @@ -1,16 +1,6 @@ import pytest -from rapidfuzz.distance import DamerauLevenshtein_cpp, DamerauLevenshtein_py -from ..common import GenericScorer - - -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": max(len(s1), len(s2)), "symmetric": True} - - -DamerauLevenshtein = GenericScorer( - DamerauLevenshtein_py, DamerauLevenshtein_cpp, get_scorer_flags -) +from .common import DamerauLevenshtein @pytest.mark.parametrize( diff --git a/tests/distance/test_Hamming.py b/tests/distance/test_Hamming.py index 189ee2f4..a2975ef1 100644 --- a/tests/distance/test_Hamming.py +++ b/tests/distance/test_Hamming.py @@ -1,12 +1,4 @@ -from rapidfuzz.distance import Hamming_cpp, Hamming_py -from ..common import GenericScorer - - -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": max(len(s1), len(s2)), "symmetric": True} - - -Hamming = GenericScorer(Hamming_py, Hamming_cpp, get_scorer_flags) +from .common import Hamming def test_basic(): diff --git a/tests/distance/test_Indel.py b/tests/distance/test_Indel.py index d32c5bcf..8673d73f 100644 --- a/tests/distance/test_Indel.py +++ b/tests/distance/test_Indel.py @@ -1,12 +1,4 @@ -from rapidfuzz.distance import Indel_cpp, Indel_py -from ..common import GenericScorer - - -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": len(s1) + len(s2), "symmetric": True} - - -Indel = GenericScorer(Indel_py, Indel_cpp, get_scorer_flags) +from .common import Indel def test_basic(): diff --git a/tests/distance/test_Jaro.py b/tests/distance/test_Jaro.py index f9ed99ab..00109de8 100644 --- a/tests/distance/test_Jaro.py +++ b/tests/distance/test_Jaro.py @@ -1,14 +1,5 @@ import pytest - -from rapidfuzz.distance import Jaro_cpp, Jaro_py -from ..common import GenericScorer - - -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": 1.0, "symmetric": True} - - -Jaro = GenericScorer(Jaro_py, Jaro_cpp, get_scorer_flags) +from .common import Jaro def test_hash_special_case(): diff --git a/tests/distance/test_JaroWinkler.py b/tests/distance/test_JaroWinkler.py index ac175010..0d634fdf 100644 --- a/tests/distance/test_JaroWinkler.py +++ b/tests/distance/test_JaroWinkler.py @@ -1,14 +1,5 @@ import pytest - -from rapidfuzz.distance import JaroWinkler_cpp, JaroWinkler_py -from ..common import GenericScorer - - -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": 1.0, "symmetric": True} - - -JaroWinkler = GenericScorer(JaroWinkler_py, JaroWinkler_cpp, get_scorer_flags) +from .common import JaroWinkler def test_hash_special_case(): diff --git a/tests/distance/test_LCSseq.py b/tests/distance/test_LCSseq.py index e1264bb6..d735f5ef 100644 --- a/tests/distance/test_LCSseq.py +++ b/tests/distance/test_LCSseq.py @@ -1,12 +1,4 @@ -from rapidfuzz.distance import LCSseq_cpp, LCSseq_py -from ..common import GenericScorer - - -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": max(len(s1), len(s2)), "symmetric": True} - - -LCSseq = GenericScorer(LCSseq_py, LCSseq_cpp, get_scorer_flags) +from .common import LCSseq def test_basic(): diff --git a/tests/distance/test_Levenshtein.py 
b/tests/distance/test_Levenshtein.py index bdcb7689..d4bc9a3f 100644 --- a/tests/distance/test_Levenshtein.py +++ b/tests/distance/test_Levenshtein.py @@ -1,6 +1,6 @@ from rapidfuzz import process from rapidfuzz.distance import Levenshtein_cpp, Levenshtein_py, Opcode, Opcodes -from ..common import GenericScorer +from .common import Levenshtein class CustomHashable: @@ -17,25 +17,6 @@ def __hash__(self): return hash(self._string) -def get_scorer_flags(s1, s2, weights=(1, 1, 1), **kwargs): - insert_cost, delete_cost, replace_cost = weights - max_dist = len(s1) * delete_cost + len(s2) * insert_cost - - if len(s1) >= len(s2): - max_dist = min( - max_dist, len(s2) * replace_cost + (len(s1) - len(s2)) * delete_cost - ) - else: - max_dist = min( - max_dist, len(s1) * replace_cost + (len(s2) - len(s1)) * insert_cost - ) - - return {"maximum": max_dist, "symmetric": insert_cost == delete_cost} - - -Levenshtein = GenericScorer(Levenshtein_py, Levenshtein_cpp, get_scorer_flags) - - def test_empty_string(): """ when both strings are empty this is a perfect match diff --git a/tests/distance/test_OSA.py b/tests/distance/test_OSA.py index 575cad49..273a329b 100644 --- a/tests/distance/test_OSA.py +++ b/tests/distance/test_OSA.py @@ -1,5 +1,5 @@ from rapidfuzz.distance import OSA_cpp, OSA_py -from ..common import GenericScorer +from .common import OSA def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): @@ -20,13 +20,6 @@ def __hash__(self): return hash(self._string) -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": max(len(s1), len(s2)), "symmetric": True} - - -OSA = GenericScorer(OSA_py, OSA_cpp, get_scorer_flags) - - def test_empty_string(): """ when both strings are empty this is a perfect match diff --git a/tests/distance/test_Postfix.py b/tests/distance/test_Postfix.py index 773fc7e5..6fdbf71c 100644 --- a/tests/distance/test_Postfix.py +++ b/tests/distance/test_Postfix.py @@ -1,12 +1,4 @@ -from rapidfuzz.distance import Postfix_cpp, Postfix_py -from ..common import GenericScorer - - -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": max(len(s1), len(s2)), "symmetric": True} - - -Postfix = GenericScorer(Postfix_py, Postfix_cpp, get_scorer_flags) +from .common import Postfix def test_basic(): diff --git a/tests/distance/test_Prefix.py b/tests/distance/test_Prefix.py index 31d65ba4..125823cd 100644 --- a/tests/distance/test_Prefix.py +++ b/tests/distance/test_Prefix.py @@ -1,12 +1,4 @@ -from rapidfuzz.distance import Prefix_cpp, Prefix_py -from ..common import GenericScorer - - -def get_scorer_flags(s1, s2, **kwargs): - return {"maximum": max(len(s1), len(s2)), "symmetric": True} - - -Prefix = GenericScorer(Prefix_py, Prefix_cpp, get_scorer_flags) +from .common import Prefix def test_basic(): diff --git a/tests/distance/test_distance.py b/tests/distance/test_distance.py new file mode 100644 index 00000000..cc2791a7 --- /dev/null +++ b/tests/distance/test_distance.py @@ -0,0 +1,20 @@ +import pytest +from .common import all_scorer_modules + + +@pytest.mark.parametrize("scorer", all_scorer_modules) +def test_none(scorer): + """ + All normalized scorers should be able to handle None values + """ + assert scorer.normalized_distance(None, "test") == 1.0 + assert scorer.normalized_similarity(None, "test") == 0.0 + + +@pytest.mark.parametrize("scorer", all_scorer_modules) +def test_nan(scorer): + """ + All normalized scorers should be able to handle float("nan") + """ + assert scorer.normalized_distance(float("nan"), "test") == 1.0 + assert scorer.normalized_similarity(float("nan"), "test") 
== 0.0 diff --git a/tests/test_fuzz.py b/tests/test_fuzz.py index 4c015914..1145c6fb 100644 --- a/tests/test_fuzz.py +++ b/tests/test_fuzz.py @@ -341,6 +341,15 @@ def test_none_string(scorer): assert scorer(None, "test") == 0 +@pytest.mark.parametrize("scorer", scorers) +def test_nan_string(scorer): + """ + when float("nan") is passed to a scorer the result should always be 0 + """ + assert scorer("test", float("nan")) == 0 + assert scorer(float("nan"), "test") == 0 + + @pytest.mark.parametrize("scorer", scorers) def test_simple_unicode_tests(scorer): """ diff --git a/tests/test_process.py b/tests/test_process.py index d74c35a0..6f726790 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -2,14 +2,17 @@ from rapidfuzz import fuzz, process_cpp, process_py + def wrapped(func): from functools import wraps + @wraps(func) def decorator(*args, **kwargs): return 100 return decorator + class process: @staticmethod def extract_iter(*args, **kwargs):
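
The same checks carry through the process module: missing entries are skipped instead of being compared, and a missing query yields no result. A small usage sketch, assuming the default `WRatio` scorer and `default_process` processor from the signatures above (the exact result tuple is inferred from the list handling in process_py.py / process_cpp_impl.pyx):

    from rapidfuzz import process

    choices = ["a test", None, float("nan"), "test"]
    # None / NaN choices are skipped, everything else is scored as usual
    match, score, index = process.extractOne("test", choices)
    assert match == "test" and score == 100 and index == 3
    # a None / NaN query produces no result at all
    assert process.extractOne(float("nan"), choices) is None
    assert process.extract(None, choices) == []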