add score_multiplier

rapidfuzz · Oct 21, 2023 · 27f178b · 27f178b
1 parent e3e2f89
commit 27f178b
Show file tree

Hide file tree

Showing 10 changed files with 112 additions and 48 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,12 +6,13 @@ Changelog
 Changed
 ~~~~~~~
 * skip pandas ``pd.NA`` similar to ``None``
+* add ``score_multiplier`` argument to ``process.cdist`` which allows multiplying the resulting scores
+  by a constant factor.
 
 Performance
 ~~~~~~~~~~~
 * improve performance of simd implementation for ``Indel`` / ``Jaro`` / ``JaroWinkler``
 
-
 [3.4.0] - 2023-10-09
 ^^^^^^^^^^^^^^^^^^^^
 Changed
@@ -58,7 +59,6 @@ Fixed
 ~~~~~
 * replace usage of ``isnan`` with ``std::isnan`` which fixes the build on NetBSD
 
-
 [3.1.0] - 2023-06-02
 ^^^^^^^^^^^^^^^^^^^^
 Changed

diff --git a/src/rapidfuzz/_utils.py b/src/rapidfuzz/_utils.py
@@ -13,9 +13,10 @@
 
 try:
     from pandas import NA as pandas_NA
-except:
+except BaseException:
     pandas_NA = None
 
+
 class ScorerFlag(IntFlag):
     RESULT_F64 = 1 << 5
     RESULT_I64 = 1 << 6

diff --git a/src/rapidfuzz/distance/metrics.hpp b/src/rapidfuzz/distance/metrics.hpp
@@ -514,9 +514,7 @@ static inline bool JaroDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*, int64
                                     const RF_String* str)
 {
 #ifdef RAPIDFUZZ_X64
-    if (str_count != 1)
-        return multi_distance_init<rf::experimental::MultiJaro, double>(
-            self, str_count, str);
+    if (str_count != 1) return multi_distance_init<rf::experimental::MultiJaro, double>(self, str_count, str);
 #endif
 
     return distance_init<rf::CachedJaro, double>(self, str_count, str);
@@ -534,8 +532,7 @@ static inline bool JaroNormalizedDistanceInit(RF_ScorerFunc* self, const RF_Kwar
 {
 #ifdef RAPIDFUZZ_X64
     if (str_count != 1)
-        return multi_normalized_distance_init<rf::experimental::MultiJaro, double>(
-            self, str_count, str);
+        return multi_normalized_distance_init<rf::experimental::MultiJaro, double>(self, str_count, str);
 #endif
 
     return normalized_distance_init<rf::CachedJaro, double>(self, str_count, str);
@@ -552,8 +549,7 @@ static inline bool JaroSimilarityInit(RF_ScorerFunc* self, const RF_Kwargs*, int
 {
 #ifdef RAPIDFUZZ_X64
     if (str_count != 1)
-        return multi_similarity_init<rf::experimental::MultiJaro, double>(
-            self, str_count, str);
+        return multi_similarity_init<rf::experimental::MultiJaro, double>(self, str_count, str);
 #endif
 
     return similarity_init<rf::CachedJaro, double>(self, str_count, str);
@@ -571,8 +567,7 @@ static inline bool JaroNormalizedSimilarityInit(RF_ScorerFunc* self, const RF_Kw
 {
 #ifdef RAPIDFUZZ_X64
     if (str_count != 1)
-        return multi_normalized_similarity_init<rf::experimental::MultiJaro, double>(
-            self, str_count, str);
+        return multi_normalized_similarity_init<rf::experimental::MultiJaro, double>(self, str_count, str);
 #endif
 
     return normalized_similarity_init<rf::CachedJaro, double>(self, str_count, str);
@@ -602,8 +597,8 @@ static inline bool JaroWinklerDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*
 
 #ifdef RAPIDFUZZ_X64
     if (str_count != 1)
-        return multi_distance_init<rf::experimental::MultiJaroWinkler, double>(
-            self, str_count, str, prefix_weight);
+        return multi_distance_init<rf::experimental::MultiJaroWinkler, double>(self, str_count, str,
+                                                                               prefix_weight);
 #endif
 
     return distance_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
@@ -623,8 +618,8 @@ static inline bool JaroWinklerNormalizedDistanceInit(RF_ScorerFunc* self, const
 
 #ifdef RAPIDFUZZ_X64
     if (str_count != 1)
-        return multi_normalized_distance_init<rf::experimental::MultiJaroWinkler, double>(
-            self, str_count, str, prefix_weight);
+        return multi_normalized_distance_init<rf::experimental::MultiJaroWinkler, double>(self, str_count,
+                                                                                          str, prefix_weight);
 #endif
 
     return normalized_distance_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
@@ -644,8 +639,8 @@ static inline bool JaroWinklerSimilarityInit(RF_ScorerFunc* self, const RF_Kwarg
 
 #ifdef RAPIDFUZZ_X64
     if (str_count != 1)
-        return multi_similarity_init<rf::experimental::MultiJaroWinkler, double>(
-            self, str_count, str, prefix_weight);
+        return multi_similarity_init<rf::experimental::MultiJaroWinkler, double>(self, str_count, str,
+                                                                                 prefix_weight);
 #endif
 
     return similarity_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);

diff --git a/src/rapidfuzz/process_cpp.hpp b/src/rapidfuzz/process_cpp.hpp
@@ -454,7 +454,7 @@ template <typename T>
 static Matrix cdist_single_list_impl(const RF_ScorerFlags* scorer_flags, const RF_Kwargs* kwargs,
                                      RF_Scorer* scorer, const std::vector<RF_StringWrapper>& queries,
                                      MatrixType dtype, int workers, T score_cutoff, T score_hint,
-                                     T worst_score)
+                                     T score_multiplier, T worst_score)
 {
     (void)scorer_flags;
     int64_t rows = queries.size();
@@ -473,16 +473,16 @@ static Matrix cdist_single_list_impl(const RF_ScorerFlags* scorer_flags, const R
             else
                 ScorerFunc.call(&queries[row].string, score_cutoff, score_hint, &score);
 
-            matrix.set(row, row, score);
+            matrix.set(row, row, score * score_multiplier);
 
             for (int64_t col = row + 1; col < cols; ++col) {
                 if (queries[col].is_none())
                     score = worst_score;
                 else
                     ScorerFunc.call(&queries[col].string, score_cutoff, score_hint, &score);
 
-                matrix.set(row, col, score);
-                matrix.set(col, row, score);
+                matrix.set(row, col, score * score_multiplier);
+                matrix.set(col, row, score * score_multiplier);
             }
         }
     });
@@ -494,7 +494,8 @@ template <typename T>
 static Matrix cdist_two_lists_impl(const RF_ScorerFlags* scorer_flags, const RF_Kwargs* kwargs,
                                    RF_Scorer* scorer, const std::vector<RF_StringWrapper>& queries,
                                    const std::vector<RF_StringWrapper>& choices, MatrixType dtype,
-                                   int workers, T score_cutoff, T score_hint, T worst_score)
+                                   int workers, T score_cutoff, T score_hint, T score_multiplier,
+                                   T worst_score)
 {
     int64_t rows = queries.size();
     int64_t cols = choices.size();
@@ -564,7 +565,7 @@ static Matrix cdist_two_lists_impl(const RF_ScorerFlags* scorer_flags, const RF_
                     else
                         ScorerFunc.call(&choices[col].string, score_cutoff, score_hint, &score);
 
-                    matrix.set(row_idx[row], col, score);
+                    matrix.set(row_idx[row], col, score * score_multiplier);
                 }
             }
 
@@ -592,7 +593,7 @@ static Matrix cdist_two_lists_impl(const RF_ScorerFlags* scorer_flags, const RF_
                 }
 
                 for (int64_t i = 0; i < row_count; ++i)
-                    matrix.set(row_idx[row + i], col, scores[i]);
+                    matrix.set(row_idx[row + i], col, scores[i] * score_multiplier);
             }
         });
     }
@@ -610,7 +611,7 @@ static Matrix cdist_two_lists_impl(const RF_ScorerFlags* scorer_flags, const RF_
                     else
                         ScorerFunc.call(&choices[col].string, score_cutoff, score_hint, &score);
 
-                    matrix.set(row, col, score);
+                    matrix.set(row, col, score * score_multiplier);
                 }
             }
         });

diff --git a/src/rapidfuzz/process_cpp.py b/src/rapidfuzz/process_cpp.py
@@ -63,6 +63,7 @@ def cdist(
     processor: Callable[..., Sequence[Hashable]] | None = None,
     score_cutoff: int | float | None = None,
     score_hint: int | float | None = None,
+    score_multiplier: int | float = 1,
     dtype: np.dtype | None = None,
     workers: int = 1,
     **kwargs: Any,
@@ -78,6 +79,7 @@ def cdist(
             processor=processor,
             score_cutoff=score_cutoff,
             score_hint=score_hint,
+            score_multiplier=score_multiplier,
             dtype=dtype,
             workers=workers,
             **kwargs,

diff --git a/src/rapidfuzz/process_cpp_impl.pyx b/src/rapidfuzz/process_cpp_impl.pyx
@@ -1418,6 +1418,7 @@ cdef Matrix cdist_two_lists(
     processor,
     score_cutoff,
     score_hint,
+    score_multiplier,
     dtype,
     int c_workers,
     const RF_Kwargs* scorer_kwargs
@@ -1435,7 +1436,8 @@ cdef Matrix cdist_two_lists(
             c_workers,
             get_score_cutoff_f64(score_cutoff, scorer_flags),
             get_score_cutoff_f64(score_hint, scorer_flags),
-            scorer_flags.worst_score.f64
+            <double>score_multiplier,
+            scorer_flags.worst_score.f64,
         )
 
     elif flags & RF_SCORER_FLAG_RESULT_I64:
@@ -1446,6 +1448,7 @@ cdef Matrix cdist_two_lists(
             c_workers,
             get_score_cutoff_i64(score_cutoff, scorer_flags),
             get_score_cutoff_i64(score_hint, scorer_flags),
+            <int64_t>score_multiplier,
             scorer_flags.worst_score.i64
         )
     else:
@@ -1460,6 +1463,7 @@ cdef Matrix cdist_single_list(
     processor,
     score_cutoff,
     score_hint,
+    score_multiplier,
     dtype,
     int c_workers,
     const RF_Kwargs* scorer_kwargs
@@ -1476,6 +1480,7 @@ cdef Matrix cdist_single_list(
             c_workers,
             get_score_cutoff_f64(score_cutoff, scorer_flags),
             get_score_cutoff_f64(score_hint, scorer_flags),
+            <double>score_multiplier,
             scorer_flags.worst_score.f64
         )
 
@@ -1487,6 +1492,7 @@ cdef Matrix cdist_single_list(
             c_workers,
             get_score_cutoff_i64(score_cutoff, scorer_flags),
             get_score_cutoff_i64(score_hint, scorer_flags),
+            <int64_t>score_multiplier,
             scorer_flags.worst_score.i64
         )
     else:
@@ -1497,7 +1503,7 @@ cdef Matrix cdist_single_list(
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-cdef cdist_py(queries, choices, scorer, processor, score_cutoff, dtype, workers, dict scorer_kwargs):
+cdef cdist_py(queries, choices, scorer, processor, score_cutoff, score_multiplier, dtype, workers, dict scorer_kwargs):
     # todo this should handle two similar sequences more efficiently
 
     proc_queries = preprocess_py(queries, processor)
@@ -1512,12 +1518,12 @@ cdef cdist_py(queries, choices, scorer, processor, score_cutoff, dtype, workers,
     for i in range(proc_queries.size()):
         for j in range(proc_choices.size()):
             score = scorer(<object>proc_queries[i].obj, <object>proc_choices[j].obj, **scorer_kwargs)
-            matrix.matrix.set(i, j, score)
+            matrix.matrix.set(i, j, score * <double>score_multiplier)
 
     return matrix
 
 
-def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None, score_hint=None, dtype=None, workers=1, scorer_kwargs=None):
+def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None, score_hint=None, score_multiplier=1, dtype=None, workers=1, scorer_kwargs=None):
     cdef RF_Scorer* scorer_context = NULL
     cdef RF_ScorerFlags scorer_flags
     cdef bool is_orig_scorer
@@ -1539,10 +1545,12 @@ def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None,
         if scorer_flags.flags & RF_SCORER_FLAG_SYMMETRIC and queries is choices:
             return cdist_single_list(
                 queries, scorer_context, &scorer_flags, processor,
-                score_cutoff, score_hint, dtype, workers, &kwargs_context.kwargs)
+                score_cutoff, score_hint, score_multiplier,
+                dtype, workers, &kwargs_context.kwargs)
         else:
             return cdist_two_lists(
                 queries, choices, scorer_context, &scorer_flags, processor,
-                score_cutoff, score_hint, dtype, workers, &kwargs_context.kwargs)
+                score_cutoff, score_hint, score_multiplier,
+                dtype, workers, &kwargs_context.kwargs)
 
-    return cdist_py(queries, choices, scorer, processor, score_cutoff, dtype, workers, scorer_kwargs)
+    return cdist_py(queries, choices, scorer, processor, score_cutoff, score_multiplier, dtype, workers, scorer_kwargs)
diff --git a/src/rapidfuzz/process_py.py b/src/rapidfuzz/process_py.py
@@ -4,8 +4,8 @@
 from __future__ import annotations
 
 import heapq
+import numbers
 from contextlib import suppress
-from math import isnan
 from typing import (
     Any,
     Callable,
@@ -30,6 +30,7 @@ def _get_scorer_flags_py(scorer: Any, scorer_kwargs: dict[str, Any]) -> tuple[in
         return (flags["worst_score"], flags["optimal_score"])
     return (0, 100)
 
+
 @overload
 def extract_iter(
     query: Sequence[Hashable] | None,
@@ -535,6 +536,7 @@ def cdist(
     processor: Callable[..., Sequence[Hashable]] | None = None,
     score_cutoff: int | float | None = None,
     score_hint: int | float | None = None,
+    score_multiplier: int | float = 1,
     dtype: np.dtype | None = None,
     workers: int = 1,
     scorer_kwargs: dict[str, Any] | None = None,
@@ -564,6 +566,11 @@ def cdist(
         Optional argument for an expected score to be passed to the scorer.
         This is used to select a faster implementation. Default is None,
         which deactivates this behaviour.
+    score_multiplier : int | float, optional
+        Optional argument to multiply the calculated score with. This is applied as the final step,
+        so e.g. score_cutoff is applied on the unmodified score. This is mostly useful to map from
+        a floating point range to an integer to reduce the memory usage. Default is 1,
+        which deactivates this behaviour.
     dtype : data-type, optional
         The desired data-type for the result array. Depending on the scorer type the following
         dtypes are supported:
@@ -604,23 +611,44 @@ def cdist(
 
     if queries is choices and _is_symmetric(scorer, scorer_kwargs):
         for i, query in enumerate(proc_choices):
-            results[i, i] = scorer(query, query, score_cutoff=score_cutoff, **scorer_kwargs)
+            score = scorer(query, query, score_cutoff=score_cutoff, **scorer_kwargs) * score_multiplier
+
+            if issubclass(dtype, numbers.Integral):
+                score = round(score)
+
+            results[i, i] = score
             for j in range(i + 1, len(proc_choices)):
-                results[i, j] = results[j, i] = scorer(
-                    query,
-                    proc_choices[j],
-                    score_cutoff=score_cutoff,
-                    **scorer_kwargs,
+                score = (
+                    scorer(
+                        query,
+                        proc_choices[j],
+                        score_cutoff=score_cutoff,
+                        **scorer_kwargs,
+                    )
+                    * score_multiplier
                 )
+
+                if issubclass(dtype, numbers.Integral):
+                    score = round(score)
+
+                results[i, j] = results[j, i] = score
     else:
         for i, query in enumerate(queries):
             proc_query = processor(query) if (processor and not is_none(query)) else query
             for j, choice in enumerate(proc_choices):
-                results[i, j] = scorer(
-                    proc_query,
-                    choice,
-                    score_cutoff=score_cutoff,
-                    **scorer_kwargs,
+                score = (
+                    scorer(
+                        proc_query,
+                        choice,
+                        score_cutoff=score_cutoff,
+                        **scorer_kwargs,
+                    )
+                    * score_multiplier
                 )
 
+                if issubclass(dtype, numbers.Integral):
+                    score = round(score)
+
+                results[i, j] = score
+
     return results
diff --git a/tests/common.py b/tests/common.py
@@ -13,9 +13,10 @@
 
 try:
     from pandas import NA as pandas_NA
-except:
+except BaseException:
     pandas_NA = None
 
+
 def _get_scorer_flags_py(scorer: Any, scorer_kwargs: dict[str, Any]) -> tuple[int, int]:
     params = getattr(scorer, "_RF_ScorerPy", None)
     if params is not None: