From 67a093310853b27944c0d99ffddf1904fcd3654e Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Sun, 8 Oct 2023 18:24:42 +0200 Subject: [PATCH] add simd implementation of Jaro and JaroWinkler --- CHANGELOG.rst | 6 +++ extern/rapidfuzz-cpp | 2 +- extern/taskflow | 2 +- src/rapidfuzz/distance/metrics.hpp | 70 ++++++++++++++++++++++++++ src/rapidfuzz/distance/metrics_cpp.pyx | 20 +++++++- 5 files changed, 96 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index edb2b7a3..1d442892 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,12 @@ Changelog --------- +[3.4.0] - 2023-10- +^^^^^^^^^^^^^^^^^^^^ +Changed +~~~~~~~ +- add simd implementation for Jaro and Jaro Winkler + [3.3.1] - 2023-09-25 ^^^^^^^^^^^^^^^^^^^^ Added diff --git a/extern/rapidfuzz-cpp b/extern/rapidfuzz-cpp index 05946d7a..a2cc696e 160000 --- a/extern/rapidfuzz-cpp +++ b/extern/rapidfuzz-cpp @@ -1 +1 @@ -Subproject commit 05946d7ace0443e622d72a819d3d8f900e6252c1 +Subproject commit a2cc696ed691d64e29d18e77ec7a3ac82bec6400 diff --git a/extern/taskflow b/extern/taskflow index 12f8bd4e..f1490ffc 160000 --- a/extern/taskflow +++ b/extern/taskflow @@ -1 +1 @@ -Subproject commit 12f8bd4e970ab27fd3dee3bffa24b5b48b54ba39 +Subproject commit f1490ffc286eba418107c4aa9b8913e8ca76443c diff --git a/src/rapidfuzz/distance/metrics.hpp b/src/rapidfuzz/distance/metrics.hpp index 8fb9044a..6acad458 100644 --- a/src/rapidfuzz/distance/metrics.hpp +++ b/src/rapidfuzz/distance/metrics.hpp @@ -513,6 +513,12 @@ static inline double jaro_distance_func(const RF_String& str1, const RF_String& static inline bool JaroDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*, int64_t str_count, const RF_String* str) { +#ifdef RAPIDFUZZ_X64 + if (str_count != 1) + return multi_distance_init( + self, str_count, str); +#endif + return distance_init(self, str_count, str); } @@ -526,6 +532,12 @@ static inline double jaro_normalized_distance_func(const RF_String& str1, const static inline bool JaroNormalizedDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*, int64_t str_count, const RF_String* str) { +#ifdef RAPIDFUZZ_X64 + if (str_count != 1) + return multi_normalized_distance_init( + self, str_count, str); +#endif + return normalized_distance_init(self, str_count, str); } @@ -538,6 +550,12 @@ static inline double jaro_similarity_func(const RF_String& str1, const RF_String static inline bool JaroSimilarityInit(RF_ScorerFunc* self, const RF_Kwargs*, int64_t str_count, const RF_String* str) { +#ifdef RAPIDFUZZ_X64 + if (str_count != 1) + return multi_similarity_init( + self, str_count, str); +#endif + return similarity_init(self, str_count, str); } @@ -551,9 +569,24 @@ static inline double jaro_normalized_similarity_func(const RF_String& str1, cons static inline bool JaroNormalizedSimilarityInit(RF_ScorerFunc* self, const RF_Kwargs*, int64_t str_count, const RF_String* str) { +#ifdef RAPIDFUZZ_X64 + if (str_count != 1) + return multi_normalized_similarity_init( + self, str_count, str); +#endif + return normalized_similarity_init(self, str_count, str); } +static inline bool JaroMultiStringSupport(const RF_Kwargs*) +{ +#ifdef RAPIDFUZZ_X64 + return true; +#else + return false; +#endif +} + /* JaroWinkler */ static inline double jaro_winkler_distance_func(const RF_String& str1, const RF_String& str2, double prefix_weight, double score_cutoff) @@ -566,6 +599,13 @@ static inline bool JaroWinklerDistanceInit(RF_ScorerFunc* self, const RF_Kwargs* const RF_String* str) { double prefix_weight = *static_cast(kwargs->context); + +#ifdef RAPIDFUZZ_X64 + if (str_count != 1) + return multi_distance_init( + self, str_count, str, prefix_weight); +#endif + return distance_init(self, str_count, str, prefix_weight); } @@ -580,6 +620,13 @@ static inline bool JaroWinklerNormalizedDistanceInit(RF_ScorerFunc* self, const int64_t str_count, const RF_String* str) { double prefix_weight = *static_cast(kwargs->context); + +#ifdef RAPIDFUZZ_X64 + if (str_count != 1) + return multi_normalized_distance_init( + self, str_count, str, prefix_weight); +#endif + return normalized_distance_init(self, str_count, str, prefix_weight); } @@ -594,6 +641,13 @@ static inline bool JaroWinklerSimilarityInit(RF_ScorerFunc* self, const RF_Kwarg const RF_String* str) { double prefix_weight = *static_cast(kwargs->context); + +#ifdef RAPIDFUZZ_X64 + if (str_count != 1) + return multi_similarity_init( + self, str_count, str, prefix_weight); +#endif + return similarity_init(self, str_count, str, prefix_weight); } @@ -608,9 +662,25 @@ static inline bool JaroWinklerNormalizedSimilarityInit(RF_ScorerFunc* self, cons int64_t str_count, const RF_String* str) { double prefix_weight = *static_cast(kwargs->context); + +#ifdef RAPIDFUZZ_X64 + if (str_count != 1) + return multi_normalized_similarity_init( + self, str_count, str, prefix_weight); +#endif + return normalized_similarity_init(self, str_count, str, prefix_weight); } +static inline bool JaroWinklerMultiStringSupport(const RF_Kwargs*) +{ +#ifdef RAPIDFUZZ_X64 + return true; +#else + return false; +#endif +} + /* Prefix */ static inline int64_t prefix_distance_func(const RF_String& str1, const RF_String& str2, int64_t score_cutoff) { diff --git a/src/rapidfuzz/distance/metrics_cpp.pyx b/src/rapidfuzz/distance/metrics_cpp.pyx index fcc25a99..9d0cd933 100644 --- a/src/rapidfuzz/distance/metrics_cpp.pyx +++ b/src/rapidfuzz/distance/metrics_cpp.pyx @@ -129,7 +129,7 @@ cdef extern from "metrics.hpp": bool OSAMultiStringSupport(const RF_Kwargs*) nogil - # Damerau Levenshtein + # Jaro double jaro_normalized_distance_func( const RF_String&, const RF_String&, double) except + nogil double jaro_distance_func( const RF_String&, const RF_String&, double) except + nogil double jaro_normalized_similarity_func(const RF_String&, const RF_String&, double) except + nogil @@ -140,7 +140,9 @@ cdef extern from "metrics.hpp": bool JaroSimilarityInit( RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) except False nogil bool JaroNormalizedSimilarityInit(RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) except False nogil - # Damerau Levenshtein + bool JaroMultiStringSupport(const RF_Kwargs*) nogil + + # Jaro Winkler double jaro_winkler_normalized_distance_func( const RF_String&, const RF_String&, double, double) except + nogil double jaro_winkler_distance_func( const RF_String&, const RF_String&, double, double) except + nogil double jaro_winkler_normalized_similarity_func(const RF_String&, const RF_String&, double, double) except + nogil @@ -151,6 +153,8 @@ cdef extern from "metrics.hpp": bool JaroWinklerSimilarityInit( RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) except False nogil bool JaroWinklerNormalizedSimilarityInit(RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) except False nogil + bool JaroWinklerMultiStringSupport(const RF_Kwargs*) nogil + # Prefix double prefix_normalized_distance_func( const RF_String&, const RF_String&, double) except + nogil int64_t prefix_distance_func( const RF_String&, const RF_String&, int64_t) except + nogil @@ -879,12 +883,18 @@ def jaro_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): cdef bool GetScorerFlagsJaroDistance(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) except False nogil: scorer_flags.flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC | RF_SCORER_NONE_IS_WORST_SCORE + if JaroMultiStringSupport(self): + scorer_flags.flags |= RF_SCORER_FLAG_MULTI_STRING_INIT + scorer_flags.optimal_score.f64 = 0.0 scorer_flags.worst_score.f64 = 1.0 return True cdef bool GetScorerFlagsJaroSimilarity(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) except False nogil: scorer_flags.flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC | RF_SCORER_NONE_IS_WORST_SCORE + if JaroMultiStringSupport(self): + scorer_flags.flags |= RF_SCORER_FLAG_MULTI_STRING_INIT + scorer_flags.optimal_score.f64 = 1.0 scorer_flags.worst_score.f64 = 0 return True @@ -951,12 +961,18 @@ cdef bool JaroWinklerKwargsInit(RF_Kwargs * self, dict kwargs) except False: cdef bool GetScorerFlagsJaroWinklerDistance(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) except False nogil: scorer_flags.flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC | RF_SCORER_NONE_IS_WORST_SCORE + if JaroWinklerMultiStringSupport(self): + scorer_flags.flags |= RF_SCORER_FLAG_MULTI_STRING_INIT + scorer_flags.optimal_score.f64 = 0.0 scorer_flags.worst_score.f64 = 1.0 return True cdef bool GetScorerFlagsJaroWinklerSimilarity(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) except False nogil: scorer_flags.flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC | RF_SCORER_NONE_IS_WORST_SCORE + if JaroWinklerMultiStringSupport(self): + scorer_flags.flags |= RF_SCORER_FLAG_MULTI_STRING_INIT + scorer_flags.optimal_score.f64 = 1.0 scorer_flags.worst_score.f64 = 0 return True