diff --git a/bench/bench-jarowinkler.cpp b/bench/bench-jarowinkler.cpp index 001cb031..26f07375 100644 --- a/bench/bench-jarowinkler.cpp +++ b/bench/bench-jarowinkler.cpp @@ -28,7 +28,6 @@ std::basic_string str_multiply(std::basic_string a, unsigned int b) return output; } - static void BM_JaroLongSimilarSequence(benchmark::State& state) { size_t len = state.range(0); diff --git a/extras/rapidfuzz_amalgamated.hpp b/extras/rapidfuzz_amalgamated.hpp index ec44795d..e6fedf9b 100644 --- a/extras/rapidfuzz_amalgamated.hpp +++ b/extras/rapidfuzz_amalgamated.hpp @@ -1,7 +1,7 @@ // Licensed under the MIT License . // SPDX-License-Identifier: MIT // RapidFuzz v1.0.2 -// Generated: 2023-10-08 18:06:53.104764 +// Generated: 2023-10-08 19:49:45.137761 // ---------------------------------------------------------- // This file is an amalgamation of multiple different files. // You probably shouldn't edit it directly. @@ -5438,13 +5438,10 @@ static inline int64_t jaro_bounds(int64_t P_len, int64_t T_len) /* since jaro uses a sliding window some parts of T/P might never be in * range an can be removed ahead of time */ - int64_t Bound = 0; - if (T_len > P_len) { - Bound = T_len / 2 - 1; - } - else { - Bound = P_len / 2 - 1; - } + int64_t Bound = (T_len > P_len) ? T_len : P_len; + Bound /= 2; + if (Bound > 0) Bound--; + return Bound; } @@ -5457,6 +5454,10 @@ int64_t jaro_bounds(Range& P, Range& T) int64_t P_len = P.size(); int64_t T_len = T.size(); + // this is currently an early exit condition + // if this is changed handle this below, so Bound is never below 0 + assert(P_len != 0 || T_len != 0); + /* since jaro uses a sliding window some parts of T/P might never be in * range an can be removed ahead of time */ diff --git a/fuzzing/fuzz_jaro_similarity.cpp b/fuzzing/fuzz_jaro_similarity.cpp index c3816e03..86f66473 100644 --- a/fuzzing/fuzz_jaro_similarity.cpp +++ b/fuzzing/fuzz_jaro_similarity.cpp @@ -13,6 +13,51 @@ bool is_close(double a, double b, double epsilon) return fabs(a - b) <= ((fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * epsilon); } +template +void validate_simd(const std::basic_string& s1, const std::basic_string& s2) +{ +#ifdef RAPIDFUZZ_SIMD + size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0); + if (count == 0) return; + + rapidfuzz::experimental::MultiJaro scorer(count); + + std::vector> strings; + + for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) { + if (std::distance(it1, s1.end()) < static_cast(MaxLen)) { + strings.emplace_back(it1, s1.end()); + break; + } + else { + strings.emplace_back(it1, it1 + MaxLen); + } + } + + for (const auto& s : strings) + scorer.insert(s); + + std::vector simd_results(scorer.result_count()); + scorer.similarity(&simd_results[0], simd_results.size(), s2); + + for (size_t i = 0; i < strings.size(); ++i) { + double reference_sim = rapidfuzz_reference::jaro_similarity(strings[i], s2); + + if (!is_close(simd_results[i], reference_sim, 0.0001)) { + print_seq("s1", s1); + print_seq("s2", s2); + throw std::logic_error(std::string("jaro similarity using simd failed (reference_score = ") + + std::to_string(reference_sim) + std::string(", score = ") + + std::to_string(simd_results[i]) + ")"); + } + } + +#else + (void)s1; + (void)s2; +#endif +} + void validate_distance(const std::basic_string& s1, const std::basic_string& s2) { double reference_sim = rapidfuzz_reference::jaro_similarity(s1, s2); @@ -25,6 +70,11 @@ void validate_distance(const std::basic_string& s1, const std::basic_st std::to_string(reference_sim) + std::string(", score = ") + std::to_string(sim) + ")"); } + + validate_simd<8>(s1, s2); + validate_simd<16>(s1, s2); + validate_simd<32>(s1, s2); + validate_simd<64>(s1, s2); } extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) diff --git a/fuzzing/fuzz_levenshtein_editops.cpp b/fuzzing/fuzz_levenshtein_editops.cpp index 8c9d2078..cccad4b4 100644 --- a/fuzzing/fuzz_levenshtein_editops.cpp +++ b/fuzzing/fuzz_levenshtein_editops.cpp @@ -7,7 +7,8 @@ #include #include -void validate_editops(const std::basic_string& s1, const std::basic_string& s2, int64_t score, int64_t score_hint = std::numeric_limits::max()) +void validate_editops(const std::basic_string& s1, const std::basic_string& s2, + int64_t score, int64_t score_hint = std::numeric_limits::max()) { rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2, score_hint); if (static_cast(ops.size()) == score && s2 != rapidfuzz::editops_apply(ops, s1, s2)) diff --git a/rapidfuzz/distance/Jaro_impl.hpp b/rapidfuzz/distance/Jaro_impl.hpp index b4db9929..201639eb 100644 --- a/rapidfuzz/distance/Jaro_impl.hpp +++ b/rapidfuzz/distance/Jaro_impl.hpp @@ -321,13 +321,11 @@ static inline int64_t jaro_bounds(int64_t P_len, int64_t T_len) /* since jaro uses a sliding window some parts of T/P might never be in * range an can be removed ahead of time */ - int64_t Bound = 0; - if (T_len > P_len) { - Bound = T_len / 2 - 1; - } - else { - Bound = P_len / 2 - 1; - } + int64_t Bound = (T_len > P_len) ? T_len : P_len; + Bound /= 2; + if(Bound > 0) + Bound--; + return Bound; } @@ -340,6 +338,10 @@ int64_t jaro_bounds(Range& P, Range& T) int64_t P_len = P.size(); int64_t T_len = T.size(); + // this is currently an early exit condition + // if this is changed handle this below, so Bound is never below 0 + assert(P_len != 0 || T_len != 0); + /* since jaro uses a sliding window some parts of T/P might never be in * range an can be removed ahead of time */ diff --git a/test/common.hpp b/test/common.hpp index 9dddb552..238e2055 100644 --- a/test/common.hpp +++ b/test/common.hpp @@ -57,3 +57,13 @@ class BidirectionalIterWrapper { private: T iter; }; + +template +std::basic_string str_multiply(std::basic_string a, size_t b) +{ + std::basic_string output; + while (b--) + output += a; + + return output; +} diff --git a/test/distance/tests-DamerauLevenshtein.cpp b/test/distance/tests-DamerauLevenshtein.cpp index b2eced88..4a72db0d 100644 --- a/test/distance/tests-DamerauLevenshtein.cpp +++ b/test/distance/tests-DamerauLevenshtein.cpp @@ -8,16 +8,6 @@ #include "../common.hpp" -template -std::basic_string str_multiply(std::basic_string a, unsigned int b) -{ - std::basic_string output; - while (b--) - output += a; - - return output; -} - template int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2, int64_t max = std::numeric_limits::max()) diff --git a/test/distance/tests-Jaro.cpp b/test/distance/tests-Jaro.cpp index e37ddac5..b800f79a 100644 --- a/test/distance/tests-Jaro.cpp +++ b/test/distance/tests-Jaro.cpp @@ -3,6 +3,8 @@ #include #include +#include "../common.hpp" + using Catch::Approx; template @@ -80,10 +82,25 @@ double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cuto return res1; } -/** - * @name JaroWinklerFlagCharsTest - */ -TEST_CASE("JaroWinklerTest") +template +double jaro_sim_test(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) +{ + INFO("name1: " << s1 << ", name2: " << s2 << ", score_cutoff: " << score_cutoff); + double Sim_original = rapidfuzz_reference::jaro_similarity(s1, s2, score_cutoff); + double Sim_bitparallel = jaro_similarity(s1, s2, score_cutoff); + double Dist_bitparallel = jaro_distance(s1, s2, 1.0 - score_cutoff); + double Sim_bitparallel2 = jaro_similarity(s2, s1, score_cutoff); + double Dist_bitparallel2 = jaro_distance(s2, s1, 1.0 - score_cutoff); + + + REQUIRE(Sim_original == Approx(Sim_bitparallel)); + REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel)); + REQUIRE(Sim_original == Approx(Sim_bitparallel2)); + REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel2)); + return Sim_original; +} + +TEST_CASE("JaroTest") { std::array names = {"james", "robert", "john", "michael", "william", "david", "joseph", "thomas", "charles", "mary", @@ -96,14 +113,26 @@ TEST_CASE("JaroWinklerTest") for (double score_cutoff : score_cutoffs) for (const auto& name1 : names) - for (const auto& name2 : names) { - INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff); - double Sim_original = rapidfuzz_reference::jaro_similarity(name1, name2, score_cutoff); - double Sim_bitparallel = jaro_similarity(name1, name2, score_cutoff); - double Dist_bitparallel = jaro_distance(name1, name2, 1.0 - score_cutoff); + for (const auto& name2 : names) + jaro_sim_test(name1, name2, score_cutoff); + } + + SECTION("testEdgeCaseLengths") + { + REQUIRE(jaro_sim_test(std::string(""), std::string("")) == Approx(1)); + REQUIRE(jaro_sim_test(std::string("0"), std::string("0")) == Approx(1)); + REQUIRE(jaro_sim_test(std::string("00"), std::string("00")) == Approx(1)); + REQUIRE(jaro_sim_test(std::string("0"), std::string("00")) == Approx(0.833333)); + + REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 65), str_multiply(std::string("0"), 65)) == Approx(1)); + REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.994872)); + REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.989744)); - REQUIRE(Sim_original == Approx(Sim_bitparallel)); - REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel)); - } + REQUIRE(jaro_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487)); + REQUIRE(jaro_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.922233)); + REQUIRE(jaro_sim_test( + std::string("00000000000000000000000000000000000000000000000000000000000000000"), + std::string("01000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000") + ) == Approx(0.8359375)); } } \ No newline at end of file diff --git a/test/distance/tests-JaroWinkler.cpp b/test/distance/tests-JaroWinkler.cpp index c3e48e1d..847ed35c 100644 --- a/test/distance/tests-JaroWinkler.cpp +++ b/test/distance/tests-JaroWinkler.cpp @@ -3,6 +3,8 @@ #include #include +#include "../common.hpp" + using Catch::Approx; template @@ -86,9 +88,24 @@ double jaro_winkler_distance(const Sentence1& s1, const Sentence2& s2, double pr return res1; } -/** - * @name JaroWinklerFlagCharsTest - */ +template +double jaro_winkler_sim_test(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0) +{ + INFO("name1: " << s1 << ", name2: " << s2 << ", score_cutoff: " << score_cutoff); + double Sim_original = rapidfuzz_reference::jaro_winkler_similarity(s1, s2, 0.1, score_cutoff); + double Sim_bitparallel = jaro_winkler_similarity(s1, s2, 0.1, score_cutoff); + double Dist_bitparallel = jaro_winkler_distance(s1, s2, 0.1, 1.0 - score_cutoff); + double Sim_bitparallel2 = jaro_winkler_similarity(s2, s1, 0.1, score_cutoff); + double Dist_bitparallel2 = jaro_winkler_distance(s2, s1, 0.1, 1.0 - score_cutoff); + + + REQUIRE(Sim_original == Approx(Sim_bitparallel)); + REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel)); + REQUIRE(Sim_original == Approx(Sim_bitparallel2)); + REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel2)); + return Sim_original; +} + TEST_CASE("JaroWinklerTest") { std::array names = {"james", "robert", "john", "michael", "william", @@ -103,14 +120,26 @@ TEST_CASE("JaroWinklerTest") for (double score_cutoff : score_cutoffs) for (const auto& name1 : names) - for (const auto& name2 : names) { - INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff); - double Sim_original = - rapidfuzz_reference::jaro_winkler_similarity(name1, name2, 0.1, score_cutoff); - double Sim_bitparallel = jaro_winkler_similarity(name1, name2, 0.1, score_cutoff); - double Dist_bitparallel = jaro_winkler_distance(name1, name2, 0.1, 1.0 - score_cutoff); - REQUIRE(Sim_original == Approx(Sim_bitparallel)); - REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel)); - } + for (const auto& name2 : names) + jaro_winkler_sim_test(name1, name2, score_cutoff); + } + + SECTION("testEdgeCaseLengths") + { + REQUIRE(jaro_winkler_sim_test(std::string(""), std::string("")) == Approx(1)); + REQUIRE(jaro_winkler_sim_test(std::string("0"), std::string("0")) == Approx(1)); + REQUIRE(jaro_winkler_sim_test(std::string("00"), std::string("00")) == Approx(1)); + REQUIRE(jaro_winkler_sim_test(std::string("0"), std::string("00")) == Approx(0.85)); + + REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 65), str_multiply(std::string("0"), 65)) == Approx(1)); + REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.996923)); + REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.993846)); + + REQUIRE(jaro_winkler_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487)); + REQUIRE(jaro_winkler_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.95334)); + REQUIRE(jaro_winkler_sim_test( + std::string("00000000000000000000000000000000000000000000000000000000000000000"), + std::string("01000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000") + ) == Approx(0.852344)); } } \ No newline at end of file diff --git a/test/distance/tests-LCSseq.cpp b/test/distance/tests-LCSseq.cpp index 1d09ee85..5d47f25a 100644 --- a/test/distance/tests-LCSseq.cpp +++ b/test/distance/tests-LCSseq.cpp @@ -7,16 +7,6 @@ #include "../common.hpp" -template -std::basic_string str_multiply(std::basic_string a, unsigned int b) -{ - std::basic_string output; - while (b--) - output += a; - - return output; -} - template int64_t lcs_seq_distance(const Sentence1& s1, const Sentence2& s2, int64_t max = std::numeric_limits::max()) diff --git a/test/distance/tests-Levenshtein.cpp b/test/distance/tests-Levenshtein.cpp index e60f3915..6b80c123 100644 --- a/test/distance/tests-Levenshtein.cpp +++ b/test/distance/tests-Levenshtein.cpp @@ -10,16 +10,6 @@ #include "../common.hpp" -template -std::basic_string str_multiply(std::basic_string a, unsigned int b) -{ - std::basic_string output; - while (b--) - output += a; - - return output; -} - template int64_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2, rapidfuzz::LevenshteinWeightTable weights = {1, 1, 1}, diff --git a/test/distance/tests-OSA.cpp b/test/distance/tests-OSA.cpp index 6e1b3494..2fad2517 100644 --- a/test/distance/tests-OSA.cpp +++ b/test/distance/tests-OSA.cpp @@ -6,16 +6,6 @@ #include "../common.hpp" -template -std::basic_string str_multiply(std::basic_string a, unsigned int b) -{ - std::basic_string output; - while (b--) - output += a; - - return output; -} - template int64_t osa_distance(const Sentence1& s1, const Sentence2& s2, int64_t max = std::numeric_limits::max())