Skip to content

Commit

Permalink
add simd implementation of Jaro and JaroWinkler
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Oct 8, 2023
1 parent 10efe27 commit 67a0933
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 4 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Changelog
---------

[3.4.0] - 2023-10-
^^^^^^^^^^^^^^^^^^^^
Changed
~~~~~~~
- add SIMD implementation for Jaro and Jaro-Winkler

[3.3.1] - 2023-09-25
^^^^^^^^^^^^^^^^^^^^
Added
Expand Down
2 changes: 1 addition & 1 deletion extern/rapidfuzz-cpp
2 changes: 1 addition & 1 deletion extern/taskflow
Submodule taskflow updated 1034 files
70 changes: 70 additions & 0 deletions src/rapidfuzz/distance/metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,12 @@ static inline double jaro_distance_func(const RF_String& str1, const RF_String&
/**
 * Scorer-init entry point for the Jaro distance.
 *
 * Forwards `self`, `str_count` and `str` to one of two init helpers and
 * returns that helper's result:
 *  - when RAPIDFUZZ_X64 is defined and `str_count` is anything other than 1,
 *    `multi_distance_init` instantiated with `rf::experimental::MultiJaro`
 *    (presumably the batched SIMD implementation -- confirm in rapidfuzz-cpp);
 *  - otherwise `distance_init` instantiated with `rf::CachedJaro`.
 *
 * The `RF_Kwargs` argument is accepted but not used.
 */
static inline bool JaroDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*, int64_t str_count,
                                    const RF_String* str)
{
#ifdef RAPIDFUZZ_X64
    // Only the exactly-one-string case stays on the cached scorer below.
    const bool single_query = (str_count == 1);
    if (!single_query) {
        return multi_distance_init<rf::experimental::MultiJaro, double>(self, str_count, str);
    }
#endif

    return distance_init<rf::CachedJaro, double>(self, str_count, str);
}

Expand All @@ -526,6 +532,12 @@ static inline double jaro_normalized_distance_func(const RF_String& str1, const
/**
 * Scorer-init entry point for the normalized Jaro distance.
 *
 * Forwards `self`, `str_count` and `str` to one of two init helpers and
 * returns that helper's result:
 *  - when RAPIDFUZZ_X64 is defined and `str_count` is anything other than 1,
 *    `multi_normalized_distance_init` instantiated with
 *    `rf::experimental::MultiJaro`;
 *  - otherwise `normalized_distance_init` instantiated with `rf::CachedJaro`.
 *
 * The `RF_Kwargs` argument is accepted but not used.
 */
static inline bool JaroNormalizedDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*, int64_t str_count,
                                              const RF_String* str)
{
#ifdef RAPIDFUZZ_X64
    // Only the exactly-one-string case stays on the cached scorer below.
    const bool single_query = (str_count == 1);
    if (!single_query) {
        return multi_normalized_distance_init<rf::experimental::MultiJaro, double>(self, str_count, str);
    }
#endif

    return normalized_distance_init<rf::CachedJaro, double>(self, str_count, str);
}

Expand All @@ -538,6 +550,12 @@ static inline double jaro_similarity_func(const RF_String& str1, const RF_String
/**
 * Scorer-init entry point for the Jaro similarity.
 *
 * Forwards `self`, `str_count` and `str` to one of two init helpers and
 * returns that helper's result:
 *  - when RAPIDFUZZ_X64 is defined and `str_count` is anything other than 1,
 *    `multi_similarity_init` instantiated with `rf::experimental::MultiJaro`;
 *  - otherwise `similarity_init` instantiated with `rf::CachedJaro`.
 *
 * The `RF_Kwargs` argument is accepted but not used.
 */
static inline bool JaroSimilarityInit(RF_ScorerFunc* self, const RF_Kwargs*, int64_t str_count,
                                      const RF_String* str)
{
#ifdef RAPIDFUZZ_X64
    // Only the exactly-one-string case stays on the cached scorer below.
    const bool single_query = (str_count == 1);
    if (!single_query) {
        return multi_similarity_init<rf::experimental::MultiJaro, double>(self, str_count, str);
    }
#endif

    return similarity_init<rf::CachedJaro, double>(self, str_count, str);
}

Expand All @@ -551,9 +569,24 @@ static inline double jaro_normalized_similarity_func(const RF_String& str1, cons
/**
 * Scorer-init entry point for the normalized Jaro similarity.
 *
 * Forwards `self`, `str_count` and `str` to one of two init helpers and
 * returns that helper's result:
 *  - when RAPIDFUZZ_X64 is defined and `str_count` is anything other than 1,
 *    `multi_normalized_similarity_init` instantiated with
 *    `rf::experimental::MultiJaro`;
 *  - otherwise `normalized_similarity_init` instantiated with
 *    `rf::CachedJaro`.
 *
 * The `RF_Kwargs` argument is accepted but not used.
 */
static inline bool JaroNormalizedSimilarityInit(RF_ScorerFunc* self, const RF_Kwargs*, int64_t str_count,
                                                const RF_String* str)
{
#ifdef RAPIDFUZZ_X64
    // Only the exactly-one-string case stays on the cached scorer below.
    const bool single_query = (str_count == 1);
    if (!single_query) {
        return multi_normalized_similarity_init<rf::experimental::MultiJaro, double>(self, str_count, str);
    }
#endif

    return normalized_similarity_init<rf::CachedJaro, double>(self, str_count, str);
}

/**
 * Reports whether the Jaro scorers were compiled with the multi-string init
 * path: `true` when RAPIDFUZZ_X64 is defined, `false` otherwise.
 *
 * The `RF_Kwargs` argument is accepted but not inspected.
 */
static inline bool JaroMultiStringSupport(const RF_Kwargs*)
{
#if defined(RAPIDFUZZ_X64)
    constexpr bool multi_string_available = true;
#else
    constexpr bool multi_string_available = false;
#endif
    return multi_string_available;
}

/* JaroWinkler */
static inline double jaro_winkler_distance_func(const RF_String& str1, const RF_String& str2,
double prefix_weight, double score_cutoff)
Expand All @@ -566,6 +599,13 @@ static inline bool JaroWinklerDistanceInit(RF_ScorerFunc* self, const RF_Kwargs*
const RF_String* str)
{
double prefix_weight = *static_cast<double*>(kwargs->context);

#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_distance_init<rf::experimental::MultiJaroWinkler, double>(
self, str_count, str, prefix_weight);
#endif

return distance_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
}

Expand All @@ -580,6 +620,13 @@ static inline bool JaroWinklerNormalizedDistanceInit(RF_ScorerFunc* self, const
int64_t str_count, const RF_String* str)
{
double prefix_weight = *static_cast<double*>(kwargs->context);

#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_normalized_distance_init<rf::experimental::MultiJaroWinkler, double>(
self, str_count, str, prefix_weight);
#endif

return normalized_distance_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
}

Expand All @@ -594,6 +641,13 @@ static inline bool JaroWinklerSimilarityInit(RF_ScorerFunc* self, const RF_Kwarg
const RF_String* str)
{
double prefix_weight = *static_cast<double*>(kwargs->context);

#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_similarity_init<rf::experimental::MultiJaroWinkler, double>(
self, str_count, str, prefix_weight);
#endif

return similarity_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
}

Expand All @@ -608,9 +662,25 @@ static inline bool JaroWinklerNormalizedSimilarityInit(RF_ScorerFunc* self, cons
int64_t str_count, const RF_String* str)
{
double prefix_weight = *static_cast<double*>(kwargs->context);

#ifdef RAPIDFUZZ_X64
if (str_count != 1)
return multi_normalized_similarity_init<rf::experimental::MultiJaroWinkler, double>(
self, str_count, str, prefix_weight);
#endif

return normalized_similarity_init<rf::CachedJaroWinkler, double>(self, str_count, str, prefix_weight);
}

/**
 * Reports whether the Jaro-Winkler scorers were compiled with the
 * multi-string init path: `true` when RAPIDFUZZ_X64 is defined, `false`
 * otherwise.
 *
 * The `RF_Kwargs` argument is accepted but not inspected.
 */
static inline bool JaroWinklerMultiStringSupport(const RF_Kwargs*)
{
#if defined(RAPIDFUZZ_X64)
    constexpr bool multi_string_available = true;
#else
    constexpr bool multi_string_available = false;
#endif
    return multi_string_available;
}

/* Prefix */
static inline int64_t prefix_distance_func(const RF_String& str1, const RF_String& str2, int64_t score_cutoff)
{
Expand Down
20 changes: 18 additions & 2 deletions src/rapidfuzz/distance/metrics_cpp.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ cdef extern from "metrics.hpp":

bool OSAMultiStringSupport(const RF_Kwargs*) nogil

# Damerau Levenshtein
# Jaro
double jaro_normalized_distance_func( const RF_String&, const RF_String&, double) except + nogil
double jaro_distance_func( const RF_String&, const RF_String&, double) except + nogil
double jaro_normalized_similarity_func(const RF_String&, const RF_String&, double) except + nogil
Expand All @@ -140,7 +140,9 @@ cdef extern from "metrics.hpp":
bool JaroSimilarityInit( RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) except False nogil
bool JaroNormalizedSimilarityInit(RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) except False nogil

# Damerau Levenshtein
bool JaroMultiStringSupport(const RF_Kwargs*) nogil

# Jaro Winkler
double jaro_winkler_normalized_distance_func( const RF_String&, const RF_String&, double, double) except + nogil
double jaro_winkler_distance_func( const RF_String&, const RF_String&, double, double) except + nogil
double jaro_winkler_normalized_similarity_func(const RF_String&, const RF_String&, double, double) except + nogil
Expand All @@ -151,6 +153,8 @@ cdef extern from "metrics.hpp":
bool JaroWinklerSimilarityInit( RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) except False nogil
bool JaroWinklerNormalizedSimilarityInit(RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) except False nogil

bool JaroWinklerMultiStringSupport(const RF_Kwargs*) nogil

# Prefix
double prefix_normalized_distance_func( const RF_String&, const RF_String&, double) except + nogil
int64_t prefix_distance_func( const RF_String&, const RF_String&, int64_t) except + nogil
Expand Down Expand Up @@ -879,12 +883,18 @@ def jaro_normalized_similarity(s1, s2, *, processor=None, score_cutoff=None):

cdef bool GetScorerFlagsJaroDistance(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) except False nogil:
    # Fill `scorer_flags` for the Jaro distance scorers; always returns True.

    # Score bounds written here: 0.0 is the optimal score, 1.0 the worst.
    scorer_flags.optimal_score.f64 = 0.0
    scorer_flags.worst_score.f64 = 1.0

    # Base flag set, extended with the multi-string-init flag only when
    # JaroMultiStringSupport() reports that path as available.
    scorer_flags.flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC | RF_SCORER_NONE_IS_WORST_SCORE
    if JaroMultiStringSupport(self):
        scorer_flags.flags |= RF_SCORER_FLAG_MULTI_STRING_INIT

    return True

cdef bool GetScorerFlagsJaroSimilarity(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) except False nogil:
    # Fill `scorer_flags` for the Jaro similarity scorers; always returns True.

    # Score bounds written here: 1.0 is the optimal score, 0.0 the worst.
    scorer_flags.optimal_score.f64 = 1.0
    scorer_flags.worst_score.f64 = 0.0

    # Base flag set, extended with the multi-string-init flag only when
    # JaroMultiStringSupport() reports that path as available.
    scorer_flags.flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC | RF_SCORER_NONE_IS_WORST_SCORE
    if JaroMultiStringSupport(self):
        scorer_flags.flags |= RF_SCORER_FLAG_MULTI_STRING_INIT

    return True
Expand Down Expand Up @@ -951,12 +961,18 @@ cdef bool JaroWinklerKwargsInit(RF_Kwargs * self, dict kwargs) except False:

cdef bool GetScorerFlagsJaroWinklerDistance(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) except False nogil:
    # Fill `scorer_flags` for the Jaro-Winkler distance scorers; always returns True.

    # Score bounds written here: 0.0 is the optimal score, 1.0 the worst.
    scorer_flags.optimal_score.f64 = 0.0
    scorer_flags.worst_score.f64 = 1.0

    # Base flag set, extended with the multi-string-init flag only when
    # JaroWinklerMultiStringSupport() reports that path as available.
    scorer_flags.flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC | RF_SCORER_NONE_IS_WORST_SCORE
    if JaroWinklerMultiStringSupport(self):
        scorer_flags.flags |= RF_SCORER_FLAG_MULTI_STRING_INIT

    return True

cdef bool GetScorerFlagsJaroWinklerSimilarity(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) except False nogil:
    # Fill `scorer_flags` for the Jaro-Winkler similarity scorers; always returns True.

    # Score bounds written here: 1.0 is the optimal score, 0.0 the worst.
    scorer_flags.optimal_score.f64 = 1.0
    scorer_flags.worst_score.f64 = 0.0

    # Base flag set, extended with the multi-string-init flag only when
    # JaroWinklerMultiStringSupport() reports that path as available.
    scorer_flags.flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC | RF_SCORER_NONE_IS_WORST_SCORE
    if JaroWinklerMultiStringSupport(self):
        scorer_flags.flags |= RF_SCORER_FLAG_MULTI_STRING_INIT

    return True
Expand Down

0 comments on commit 67a0933

Please sign in to comment.