Skip to content

Commit

Permalink
fix edge case in jaro simd implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Oct 8, 2023
1 parent a2cc696 commit d13f1f7
Show file tree
Hide file tree
Showing 12 changed files with 162 additions and 81 deletions.
1 change: 0 additions & 1 deletion bench/bench-jarowinkler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
return output;
}


static void BM_JaroLongSimilarSequence(benchmark::State& state)
{
size_t len = state.range(0);
Expand Down
17 changes: 9 additions & 8 deletions extras/rapidfuzz_amalgamated.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// RapidFuzz v1.0.2
// Generated: 2023-10-08 18:06:53.104764
// Generated: 2023-10-08 19:49:45.137761
// ----------------------------------------------------------
// This file is an amalgamation of multiple different files.
// You probably shouldn't edit it directly.
Expand Down Expand Up @@ -5438,13 +5438,10 @@ static inline int64_t jaro_bounds(int64_t P_len, int64_t T_len)
/* since jaro uses a sliding window some parts of T/P might never be in
* range an can be removed ahead of time
*/
int64_t Bound = 0;
if (T_len > P_len) {
Bound = T_len / 2 - 1;
}
else {
Bound = P_len / 2 - 1;
}
int64_t Bound = (T_len > P_len) ? T_len : P_len;
Bound /= 2;
if (Bound > 0) Bound--;

return Bound;
}

Expand All @@ -5457,6 +5454,10 @@ int64_t jaro_bounds(Range<InputIt1>& P, Range<InputIt2>& T)
int64_t P_len = P.size();
int64_t T_len = T.size();

// this is currently an early exit condition
// if this is changed handle this below, so Bound is never below 0
assert(P_len != 0 || T_len != 0);

/* since jaro uses a sliding window some parts of T/P might never be in
* range an can be removed ahead of time
*/
Expand Down
50 changes: 50 additions & 0 deletions fuzzing/fuzz_jaro_similarity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,51 @@ bool is_close(double a, double b, double epsilon)
return fabs(a - b) <= ((fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * epsilon);
}

template <size_t MaxLen>
void validate_simd(const std::basic_string<uint8_t>& s1, const std::basic_string<uint8_t>& s2)
{
#ifdef RAPIDFUZZ_SIMD
size_t count = s1.size() / MaxLen + ((s1.size() % MaxLen) != 0);
if (count == 0) return;

rapidfuzz::experimental::MultiJaro<MaxLen> scorer(count);

std::vector<std::basic_string<uint8_t>> strings;

for (auto it1 = s1.begin(); it1 != s1.end(); it1 += MaxLen) {
if (std::distance(it1, s1.end()) < static_cast<ptrdiff_t>(MaxLen)) {
strings.emplace_back(it1, s1.end());
break;
}
else {
strings.emplace_back(it1, it1 + MaxLen);
}
}

for (const auto& s : strings)
scorer.insert(s);

std::vector<double> simd_results(scorer.result_count());
scorer.similarity(&simd_results[0], simd_results.size(), s2);

for (size_t i = 0; i < strings.size(); ++i) {
double reference_sim = rapidfuzz_reference::jaro_similarity(strings[i], s2);

if (!is_close(simd_results[i], reference_sim, 0.0001)) {
print_seq("s1", s1);
print_seq("s2", s2);
throw std::logic_error(std::string("jaro similarity using simd failed (reference_score = ") +
std::to_string(reference_sim) + std::string(", score = ") +
std::to_string(simd_results[i]) + ")");
}
}

#else
(void)s1;
(void)s2;
#endif
}

void validate_distance(const std::basic_string<uint8_t>& s1, const std::basic_string<uint8_t>& s2)
{
double reference_sim = rapidfuzz_reference::jaro_similarity(s1, s2);
Expand All @@ -25,6 +70,11 @@ void validate_distance(const std::basic_string<uint8_t>& s1, const std::basic_st
std::to_string(reference_sim) + std::string(", score = ") +
std::to_string(sim) + ")");
}

validate_simd<8>(s1, s2);
validate_simd<16>(s1, s2);
validate_simd<32>(s1, s2);
validate_simd<64>(s1, s2);
}

extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
Expand Down
3 changes: 2 additions & 1 deletion fuzzing/fuzz_levenshtein_editops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#include <stdexcept>
#include <string>

void validate_editops(const std::basic_string<uint8_t>& s1, const std::basic_string<uint8_t>& s2, int64_t score, int64_t score_hint = std::numeric_limits<int64_t>::max())
void validate_editops(const std::basic_string<uint8_t>& s1, const std::basic_string<uint8_t>& s2,
int64_t score, int64_t score_hint = std::numeric_limits<int64_t>::max())
{
rapidfuzz::Editops ops = rapidfuzz::levenshtein_editops(s1, s2, score_hint);
if (static_cast<int64_t>(ops.size()) == score && s2 != rapidfuzz::editops_apply<uint8_t>(ops, s1, s2))
Expand Down
16 changes: 9 additions & 7 deletions rapidfuzz/distance/Jaro_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,13 +321,11 @@ static inline int64_t jaro_bounds(int64_t P_len, int64_t T_len)
/* since jaro uses a sliding window some parts of T/P might never be in
* range an can be removed ahead of time
*/
int64_t Bound = 0;
if (T_len > P_len) {
Bound = T_len / 2 - 1;
}
else {
Bound = P_len / 2 - 1;
}
int64_t Bound = (T_len > P_len) ? T_len : P_len;
Bound /= 2;
if(Bound > 0)
Bound--;

return Bound;
}

Expand All @@ -340,6 +338,10 @@ int64_t jaro_bounds(Range<InputIt1>& P, Range<InputIt2>& T)
int64_t P_len = P.size();
int64_t T_len = T.size();

// this is currently an early exit condition
// if this is changed handle this below, so Bound is never below 0
assert(P_len != 0 || T_len != 0);

/* since jaro uses a sliding window some parts of T/P might never be in
* range an can be removed ahead of time
*/
Expand Down
10 changes: 10 additions & 0 deletions test/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,13 @@ class BidirectionalIterWrapper {
private:
T iter;
};

template <typename T>
std::basic_string<T> str_multiply(std::basic_string<T> a, size_t b)
{
std::basic_string<T> output;
while (b--)
output += a;

return output;
}
10 changes: 0 additions & 10 deletions test/distance/tests-DamerauLevenshtein.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,6 @@

#include "../common.hpp"

template <typename T>
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
{
std::basic_string<T> output;
while (b--)
output += a;

return output;
}

template <typename Sentence1, typename Sentence2>
int64_t damerau_levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
int64_t max = std::numeric_limits<int64_t>::max())
Expand Down
53 changes: 41 additions & 12 deletions test/distance/tests-Jaro.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <catch2/catch_test_macros.hpp>
#include <rapidfuzz/distance/Jaro.hpp>

#include "../common.hpp"

using Catch::Approx;

template <typename Sentence1, typename Sentence2>
Expand Down Expand Up @@ -80,10 +82,25 @@ double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cuto
return res1;
}

/**
* @name JaroWinklerFlagCharsTest
*/
TEST_CASE("JaroWinklerTest")
template <typename Sentence1, typename Sentence2>
double jaro_sim_test(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
{
INFO("name1: " << s1 << ", name2: " << s2 << ", score_cutoff: " << score_cutoff);
double Sim_original = rapidfuzz_reference::jaro_similarity(s1, s2, score_cutoff);
double Sim_bitparallel = jaro_similarity(s1, s2, score_cutoff);
double Dist_bitparallel = jaro_distance(s1, s2, 1.0 - score_cutoff);
double Sim_bitparallel2 = jaro_similarity(s2, s1, score_cutoff);
double Dist_bitparallel2 = jaro_distance(s2, s1, 1.0 - score_cutoff);


REQUIRE(Sim_original == Approx(Sim_bitparallel));
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel));
REQUIRE(Sim_original == Approx(Sim_bitparallel2));
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel2));
return Sim_original;
}

TEST_CASE("JaroTest")
{
std::array<std::string, 20> names = {"james", "robert", "john", "michael", "william",
"david", "joseph", "thomas", "charles", "mary",
Expand All @@ -96,14 +113,26 @@ TEST_CASE("JaroWinklerTest")

for (double score_cutoff : score_cutoffs)
for (const auto& name1 : names)
for (const auto& name2 : names) {
INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff);
double Sim_original = rapidfuzz_reference::jaro_similarity(name1, name2, score_cutoff);
double Sim_bitparallel = jaro_similarity(name1, name2, score_cutoff);
double Dist_bitparallel = jaro_distance(name1, name2, 1.0 - score_cutoff);
for (const auto& name2 : names)
jaro_sim_test(name1, name2, score_cutoff);
}

SECTION("testEdgeCaseLengths")
{
REQUIRE(jaro_sim_test(std::string(""), std::string("")) == Approx(1));
REQUIRE(jaro_sim_test(std::string("0"), std::string("0")) == Approx(1));
REQUIRE(jaro_sim_test(std::string("00"), std::string("00")) == Approx(1));
REQUIRE(jaro_sim_test(std::string("0"), std::string("00")) == Approx(0.833333));

REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 65), str_multiply(std::string("0"), 65)) == Approx(1));
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.994872));
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.989744));

REQUIRE(Sim_original == Approx(Sim_bitparallel));
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel));
}
REQUIRE(jaro_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487));
REQUIRE(jaro_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.922233));
REQUIRE(jaro_sim_test(
std::string("00000000000000000000000000000000000000000000000000000000000000000"),
std::string("01000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000")
) == Approx(0.8359375));
}
}
53 changes: 41 additions & 12 deletions test/distance/tests-JaroWinkler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <catch2/catch_test_macros.hpp>
#include <rapidfuzz/distance/JaroWinkler.hpp>

#include "../common.hpp"

using Catch::Approx;

template <typename Sentence1, typename Sentence2>
Expand Down Expand Up @@ -86,9 +88,24 @@ double jaro_winkler_distance(const Sentence1& s1, const Sentence2& s2, double pr
return res1;
}

/**
* @name JaroWinklerFlagCharsTest
*/
template <typename Sentence1, typename Sentence2>
double jaro_winkler_sim_test(const Sentence1& s1, const Sentence2& s2, double score_cutoff = 0.0)
{
INFO("name1: " << s1 << ", name2: " << s2 << ", score_cutoff: " << score_cutoff);
double Sim_original = rapidfuzz_reference::jaro_winkler_similarity(s1, s2, 0.1, score_cutoff);
double Sim_bitparallel = jaro_winkler_similarity(s1, s2, 0.1, score_cutoff);
double Dist_bitparallel = jaro_winkler_distance(s1, s2, 0.1, 1.0 - score_cutoff);
double Sim_bitparallel2 = jaro_winkler_similarity(s2, s1, 0.1, score_cutoff);
double Dist_bitparallel2 = jaro_winkler_distance(s2, s1, 0.1, 1.0 - score_cutoff);


REQUIRE(Sim_original == Approx(Sim_bitparallel));
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel));
REQUIRE(Sim_original == Approx(Sim_bitparallel2));
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel2));
return Sim_original;
}

TEST_CASE("JaroWinklerTest")
{
std::array<std::string, 22> names = {"james", "robert", "john", "michael", "william",
Expand All @@ -103,14 +120,26 @@ TEST_CASE("JaroWinklerTest")

for (double score_cutoff : score_cutoffs)
for (const auto& name1 : names)
for (const auto& name2 : names) {
INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff);
double Sim_original =
rapidfuzz_reference::jaro_winkler_similarity(name1, name2, 0.1, score_cutoff);
double Sim_bitparallel = jaro_winkler_similarity(name1, name2, 0.1, score_cutoff);
double Dist_bitparallel = jaro_winkler_distance(name1, name2, 0.1, 1.0 - score_cutoff);
REQUIRE(Sim_original == Approx(Sim_bitparallel));
REQUIRE((1.0 - Sim_original) == Approx(Dist_bitparallel));
}
for (const auto& name2 : names)
jaro_winkler_sim_test(name1, name2, score_cutoff);
}

SECTION("testEdgeCaseLengths")
{
REQUIRE(jaro_winkler_sim_test(std::string(""), std::string("")) == Approx(1));
REQUIRE(jaro_winkler_sim_test(std::string("0"), std::string("0")) == Approx(1));
REQUIRE(jaro_winkler_sim_test(std::string("00"), std::string("00")) == Approx(1));
REQUIRE(jaro_winkler_sim_test(std::string("0"), std::string("00")) == Approx(0.85));

REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 65), str_multiply(std::string("0"), 65)) == Approx(1));
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.996923));
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.993846));

REQUIRE(jaro_winkler_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487));
REQUIRE(jaro_winkler_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.95334));
REQUIRE(jaro_winkler_sim_test(
std::string("00000000000000000000000000000000000000000000000000000000000000000"),
std::string("01000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000")
) == Approx(0.852344));
}
}
10 changes: 0 additions & 10 deletions test/distance/tests-LCSseq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,6 @@

#include "../common.hpp"

template <typename T>
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
{
std::basic_string<T> output;
while (b--)
output += a;

return output;
}

template <typename Sentence1, typename Sentence2>
int64_t lcs_seq_distance(const Sentence1& s1, const Sentence2& s2,
int64_t max = std::numeric_limits<int64_t>::max())
Expand Down
10 changes: 0 additions & 10 deletions test/distance/tests-Levenshtein.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,6 @@

#include "../common.hpp"

template <typename T>
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
{
std::basic_string<T> output;
while (b--)
output += a;

return output;
}

template <typename Sentence1, typename Sentence2>
int64_t levenshtein_distance(const Sentence1& s1, const Sentence2& s2,
rapidfuzz::LevenshteinWeightTable weights = {1, 1, 1},
Expand Down
10 changes: 0 additions & 10 deletions test/distance/tests-OSA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,6 @@

#include "../common.hpp"

template <typename T>
std::basic_string<T> str_multiply(std::basic_string<T> a, unsigned int b)
{
std::basic_string<T> output;
while (b--)
output += a;

return output;
}

template <typename Sentence1, typename Sentence2>
int64_t osa_distance(const Sentence1& s1, const Sentence2& s2,
int64_t max = std::numeric_limits<int64_t>::max())
Expand Down

0 comments on commit d13f1f7

Please sign in to comment.