Skip to content

Commit

Permalink
add score_multiplier
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Oct 21, 2023
1 parent e3e2f89 commit 27f178b
Show file tree
Hide file tree
Showing 10 changed files with 112 additions and 48 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ Changelog
Changed
~~~~~~~
* skip pandas ``pd.NA`` similar to ``None``
* add ``score_multiplier`` argument to ``process.cdist``, which allows multiplying the resulting scores
by a constant factor.

Performance
~~~~~~~~~~~
* improve performance of simd implementation for ``Indel`` / ``Jaro`` / ``JaroWinkler``


[3.4.0] - 2023-10-09
^^^^^^^^^^^^^^^^^^^^
Changed
Expand Down Expand Up @@ -58,7 +59,6 @@ Fixed
~~~~~
* replace usage of ``isnan`` with ``std::isnan`` which fixes the build on NetBSD


[3.1.0] - 2023-06-02
^^^^^^^^^^^^^^^^^^^^
Changed
Expand Down
3 changes: 2 additions & 1 deletion src/rapidfuzz/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@

try:
from pandas import NA as pandas_NA
except:
except BaseException:
pandas_NA = None


class ScorerFlag(IntFlag):
RESULT_F64 = 1 << 5
RESULT_I64 = 1 << 6
Expand Down
25 changes: 10 additions & 15 deletions src/rapidfuzz/distance/metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -514,9 +514,7 @@ static inline bool JaroDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*, int64
const RF_String* str)
{
#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_distance_init<rf::experimental::MultiJaro, double>(
self, str_count, str);
if (str_count != 1) return multi_distance_init<rf::experimental::MultiJaro, double>(self, str_count, str);
#endif

return distance_init<rf::CachedJaro, double>(self, str_count, str);
Expand All @@ -534,8 +532,7 @@ static inline bool JaroNormalizedDistanceInit(RF_ScorerFunc* self, const RF_Kwar
{
#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_normalized_distance_init<rf::experimental::MultiJaro, double>(
self, str_count, str);
return multi_normalized_distance_init<rf::experimental::MultiJaro, double>(self, str_count, str);
#endif

return normalized_distance_init<rf::CachedJaro, double>(self, str_count, str);
Expand All @@ -552,8 +549,7 @@ static inline bool JaroSimilarityInit(RF_ScorerFunc* self, const RF_Kwargs*, int
{
#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_similarity_init<rf::experimental::MultiJaro, double>(
self, str_count, str);
return multi_similarity_init<rf::experimental::MultiJaro, double>(self, str_count, str);
#endif

return similarity_init<rf::CachedJaro, double>(self, str_count, str);
Expand All @@ -571,8 +567,7 @@ static inline bool JaroNormalizedSimilarityInit(RF_ScorerFunc* self, const RF_Kw
{
#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_normalized_similarity_init<rf::experimental::MultiJaro, double>(
self, str_count, str);
return multi_normalized_similarity_init<rf::experimental::MultiJaro, double>(self, str_count, str);
#endif

return normalized_similarity_init<rf::CachedJaro, double>(self, str_count, str);
Expand Down Expand Up @@ -602,8 +597,8 @@ static inline bool JaroWinklerDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*

#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_distance_init<rf::experimental::MultiJaroWinkler, double>(
self, str_count, str, prefix_weight);
return multi_distance_init<rf::experimental::MultiJaroWinkler, double>(self, str_count, str,
prefix_weight);
#endif

return distance_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
Expand All @@ -623,8 +618,8 @@ static inline bool JaroWinklerNormalizedDistanceInit(RF_ScorerFunc* self, const

#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_normalized_distance_init<rf::experimental::MultiJaroWinkler, double>(
self, str_count, str, prefix_weight);
return multi_normalized_distance_init<rf::experimental::MultiJaroWinkler, double>(self, str_count,
str, prefix_weight);
#endif

return normalized_distance_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
Expand All @@ -644,8 +639,8 @@ static inline bool JaroWinklerSimilarityInit(RF_ScorerFunc* self, const RF_Kwarg

#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_similarity_init<rf::experimental::MultiJaroWinkler, double>(
self, str_count, str, prefix_weight);
return multi_similarity_init<rf::experimental::MultiJaroWinkler, double>(self, str_count, str,
prefix_weight);
#endif

return similarity_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
Expand Down
17 changes: 9 additions & 8 deletions src/rapidfuzz/process_cpp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ template <typename T>
static Matrix cdist_single_list_impl(const RF_ScorerFlags* scorer_flags, const RF_Kwargs* kwargs,
RF_Scorer* scorer, const std::vector<RF_StringWrapper>& queries,
MatrixType dtype, int workers, T score_cutoff, T score_hint,
T worst_score)
T score_multiplier, T worst_score)
{
(void)scorer_flags;
int64_t rows = queries.size();
Expand All @@ -473,16 +473,16 @@ static Matrix cdist_single_list_impl(const RF_ScorerFlags* scorer_flags, const R
else
ScorerFunc.call(&queries[row].string, score_cutoff, score_hint, &score);

matrix.set(row, row, score);
matrix.set(row, row, score * score_multiplier);

for (int64_t col = row + 1; col < cols; ++col) {
if (queries[col].is_none())
score = worst_score;
else
ScorerFunc.call(&queries[col].string, score_cutoff, score_hint, &score);

matrix.set(row, col, score);
matrix.set(col, row, score);
matrix.set(row, col, score * score_multiplier);
matrix.set(col, row, score * score_multiplier);
}
}
});
Expand All @@ -494,7 +494,8 @@ template <typename T>
static Matrix cdist_two_lists_impl(const RF_ScorerFlags* scorer_flags, const RF_Kwargs* kwargs,
RF_Scorer* scorer, const std::vector<RF_StringWrapper>& queries,
const std::vector<RF_StringWrapper>& choices, MatrixType dtype,
int workers, T score_cutoff, T score_hint, T worst_score)
int workers, T score_cutoff, T score_hint, T score_multiplier,
T worst_score)
{
int64_t rows = queries.size();
int64_t cols = choices.size();
Expand Down Expand Up @@ -564,7 +565,7 @@ static Matrix cdist_two_lists_impl(const RF_ScorerFlags* scorer_flags, const RF_
else
ScorerFunc.call(&choices[col].string, score_cutoff, score_hint, &score);

matrix.set(row_idx[row], col, score);
matrix.set(row_idx[row], col, score * score_multiplier);
}
}

Expand Down Expand Up @@ -592,7 +593,7 @@ static Matrix cdist_two_lists_impl(const RF_ScorerFlags* scorer_flags, const RF_
}

for (int64_t i = 0; i < row_count; ++i)
matrix.set(row_idx[row + i], col, scores[i]);
matrix.set(row_idx[row + i], col, scores[i] * score_multiplier);
}
});
}
Expand All @@ -610,7 +611,7 @@ static Matrix cdist_two_lists_impl(const RF_ScorerFlags* scorer_flags, const RF_
else
ScorerFunc.call(&choices[col].string, score_cutoff, score_hint, &score);

matrix.set(row, col, score);
matrix.set(row, col, score * score_multiplier);
}
}
});
Expand Down
2 changes: 2 additions & 0 deletions src/rapidfuzz/process_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def cdist(
processor: Callable[..., Sequence[Hashable]] | None = None,
score_cutoff: int | float | None = None,
score_hint: int | float | None = None,
score_multiplier: int | float = 1,
dtype: np.dtype | None = None,
workers: int = 1,
**kwargs: Any,
Expand All @@ -78,6 +79,7 @@ def cdist(
processor=processor,
score_cutoff=score_cutoff,
score_hint=score_hint,
score_multiplier=score_multiplier,
dtype=dtype,
workers=workers,
**kwargs,
Expand Down
22 changes: 15 additions & 7 deletions src/rapidfuzz/process_cpp_impl.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1418,6 +1418,7 @@ cdef Matrix cdist_two_lists(
processor,
score_cutoff,
score_hint,
score_multiplier,
dtype,
int c_workers,
const RF_Kwargs* scorer_kwargs
Expand All @@ -1435,7 +1436,8 @@ cdef Matrix cdist_two_lists(
c_workers,
get_score_cutoff_f64(score_cutoff, scorer_flags),
get_score_cutoff_f64(score_hint, scorer_flags),
scorer_flags.worst_score.f64
<double>score_multiplier,
scorer_flags.worst_score.f64,
)

elif flags & RF_SCORER_FLAG_RESULT_I64:
Expand All @@ -1446,6 +1448,7 @@ cdef Matrix cdist_two_lists(
c_workers,
get_score_cutoff_i64(score_cutoff, scorer_flags),
get_score_cutoff_i64(score_hint, scorer_flags),
<int64_t>score_multiplier,
scorer_flags.worst_score.i64
)
else:
Expand All @@ -1460,6 +1463,7 @@ cdef Matrix cdist_single_list(
processor,
score_cutoff,
score_hint,
score_multiplier,
dtype,
int c_workers,
const RF_Kwargs* scorer_kwargs
Expand All @@ -1476,6 +1480,7 @@ cdef Matrix cdist_single_list(
c_workers,
get_score_cutoff_f64(score_cutoff, scorer_flags),
get_score_cutoff_f64(score_hint, scorer_flags),
<double>score_multiplier,
scorer_flags.worst_score.f64
)

Expand All @@ -1487,6 +1492,7 @@ cdef Matrix cdist_single_list(
c_workers,
get_score_cutoff_i64(score_cutoff, scorer_flags),
get_score_cutoff_i64(score_hint, scorer_flags),
<int64_t>score_multiplier,
scorer_flags.worst_score.i64
)
else:
Expand All @@ -1497,7 +1503,7 @@ cdef Matrix cdist_single_list(

@cython.boundscheck(False)
@cython.wraparound(False)
cdef cdist_py(queries, choices, scorer, processor, score_cutoff, dtype, workers, dict scorer_kwargs):
cdef cdist_py(queries, choices, scorer, processor, score_cutoff, score_multiplier, dtype, workers, dict scorer_kwargs):
# todo this should handle two similar sequences more efficiently

proc_queries = preprocess_py(queries, processor)
Expand All @@ -1512,12 +1518,12 @@ cdef cdist_py(queries, choices, scorer, processor, score_cutoff, dtype, workers,
for i in range(proc_queries.size()):
for j in range(proc_choices.size()):
score = scorer(<object>proc_queries[i].obj, <object>proc_choices[j].obj, **scorer_kwargs)
matrix.matrix.set(i, j, score)
matrix.matrix.set(i, j, score * <double>score_multiplier)

return matrix


def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None, score_hint=None, dtype=None, workers=1, scorer_kwargs=None):
def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None, score_hint=None, score_multiplier=1, dtype=None, workers=1, scorer_kwargs=None):
cdef RF_Scorer* scorer_context = NULL
cdef RF_ScorerFlags scorer_flags
cdef bool is_orig_scorer
Expand All @@ -1539,10 +1545,12 @@ def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None,
if scorer_flags.flags & RF_SCORER_FLAG_SYMMETRIC and queries is choices:
return cdist_single_list(
queries, scorer_context, &scorer_flags, processor,
score_cutoff, score_hint, dtype, workers, &kwargs_context.kwargs)
score_cutoff, score_hint, score_multiplier,
dtype, workers, &kwargs_context.kwargs)
else:
return cdist_two_lists(
queries, choices, scorer_context, &scorer_flags, processor,
score_cutoff, score_hint, dtype, workers, &kwargs_context.kwargs)
score_cutoff, score_hint, score_multiplier,
dtype, workers, &kwargs_context.kwargs)

return cdist_py(queries, choices, scorer, processor, score_cutoff, dtype, workers, scorer_kwargs)
return cdist_py(queries, choices, scorer, processor, score_cutoff, score_multiplier, dtype, workers, scorer_kwargs)
52 changes: 40 additions & 12 deletions src/rapidfuzz/process_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from __future__ import annotations

import heapq
import numbers
from contextlib import suppress
from math import isnan
from typing import (
Any,
Callable,
Expand All @@ -30,6 +30,7 @@ def _get_scorer_flags_py(scorer: Any, scorer_kwargs: dict[str, Any]) -> tuple[in
return (flags["worst_score"], flags["optimal_score"])
return (0, 100)


@overload
def extract_iter(
query: Sequence[Hashable] | None,
Expand Down Expand Up @@ -535,6 +536,7 @@ def cdist(
processor: Callable[..., Sequence[Hashable]] | None = None,
score_cutoff: int | float | None = None,
score_hint: int | float | None = None,
score_multiplier: int | float = 1,
dtype: np.dtype | None = None,
workers: int = 1,
scorer_kwargs: dict[str, Any] | None = None,
Expand Down Expand Up @@ -564,6 +566,11 @@ def cdist(
Optional argument for an expected score to be passed to the scorer.
This is used to select a faster implementation. Default is None,
which deactivates this behaviour.
score_multiplier : int or float, optional
Factor the calculated score is multiplied by. The multiplication is applied as the final step,
so e.g. score_cutoff is compared against the unmodified score. This is mostly useful to map a
floating point score range to an integer range in order to reduce memory usage. Default is 1,
which deactivates this behaviour.
dtype : data-type, optional
The desired data-type for the result array. Depending on the scorer type the following
dtypes are supported:
Expand Down Expand Up @@ -604,23 +611,44 @@ def cdist(

if queries is choices and _is_symmetric(scorer, scorer_kwargs):
for i, query in enumerate(proc_choices):
results[i, i] = scorer(query, query, score_cutoff=score_cutoff, **scorer_kwargs)
score = scorer(query, query, score_cutoff=score_cutoff, **scorer_kwargs) * score_multiplier

if issubclass(dtype, numbers.Integral):
score = round(score)

results[i, i] = score
for j in range(i + 1, len(proc_choices)):
results[i, j] = results[j, i] = scorer(
query,
proc_choices[j],
score_cutoff=score_cutoff,
**scorer_kwargs,
score = (
scorer(
query,
proc_choices[j],
score_cutoff=score_cutoff,
**scorer_kwargs,
)
* score_multiplier
)

if issubclass(dtype, numbers.Integral):
score = round(score)

results[i, j] = results[j, i] = score
else:
for i, query in enumerate(queries):
proc_query = processor(query) if (processor and not is_none(query)) else query
for j, choice in enumerate(proc_choices):
results[i, j] = scorer(
proc_query,
choice,
score_cutoff=score_cutoff,
**scorer_kwargs,
score = (
scorer(
proc_query,
choice,
score_cutoff=score_cutoff,
**scorer_kwargs,
)
* score_multiplier
)

if issubclass(dtype, numbers.Integral):
score = round(score)

results[i, j] = score

return results
3 changes: 2 additions & 1 deletion tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@

try:
from pandas import NA as pandas_NA
except:
except BaseException:
pandas_NA = None


def _get_scorer_flags_py(scorer: Any, scorer_kwargs: dict[str, Any]) -> tuple[int, int]:
params = getattr(scorer, "_RF_ScorerPy", None)
if params is not None:
Expand Down
Loading

0 comments on commit 27f178b

Please sign in to comment.