Skip to content

Commit

Permalink
fix edge case in simd implementation of Jaro/JaroWinkler
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Oct 8, 2023
1 parent 74551df commit 7a30d6b
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## Changelog

## [2.1.1] - 2023-10-08
### Fixed
- fix edge case in new simd implementation of Jaro and Jaro Winkler

## [2.1.0] - 2023-10-08
### Changed
- add support for bidirectional iterators
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
endif()

project(rapidfuzz LANGUAGES CXX VERSION 2.1.0)
project(rapidfuzz LANGUAGES CXX VERSION 2.1.1)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
include(GNUInstallDirs)
Expand Down
25 changes: 22 additions & 3 deletions extras/rapidfuzz_amalgamated.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// RapidFuzz v1.0.2
// Generated: 2023-10-08 19:49:45.137761
// Generated: 2023-10-08 21:27:21.591281
// ----------------------------------------------------------
// This file is an amalgamation of multiple different files.
// You probably shouldn't edit it directly.
Expand Down Expand Up @@ -5632,9 +5632,15 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch

native_simd<VecType> counter(VecType(1));

for (const auto& ch : s2_cur) {
// In case s2 is longer than all of the elements in s1_lengths boundMaskSize
// might have all bits set and therefor the condition ((boundMask <= boundMaskSize) & one)
// would incorrectly always set the first bit to 1.
// this is solved by splitting the loop into two parts where after this boundary is reached
// the first bit inside boundMask is no longer set
int64_t j = 0;
for (; j < maxBound; ++j) {
alignas(32) std::array<uint64_t, vecs> stored;
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, ch); });
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
native_simd<VecType> X(stored.data());
native_simd<VecType> PM_j = andnot(X & boundMask, P_flag);

Expand All @@ -5645,6 +5651,19 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
boundMask = (boundMask << 1) | ((boundMask <= boundMaskSize) & one);
}

for (; j < s2_cur.size(); ++j) {
alignas(32) std::array<uint64_t, vecs> stored;
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
native_simd<VecType> X(stored.data());
native_simd<VecType> PM_j = andnot(X & boundMask, P_flag);

P_flag |= blsi(PM_j);
T_flag |= andnot(counter, (PM_j == zero));

counter = counter << 1;
boundMask = boundMask << 1;
}

auto counts = popcount(P_flag);
alignas(32) std::array<VecType, vec_width> P_flags;
P_flag.store(P_flags.data());
Expand Down
25 changes: 23 additions & 2 deletions rapidfuzz/distance/Jaro_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -516,9 +516,16 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch

native_simd<VecType> counter(VecType(1));

for (const auto& ch : s2_cur) {
// In case s2 is longer than all of the elements in s1_lengths boundMaskSize
// might have all bits set and therefor the condition ((boundMask <= boundMaskSize) & one)
// would incorrectly always set the first bit to 1.
// this is solved by splitting the loop into two parts where after this boundary is reached
// the first bit inside boundMask is no longer set
int64_t j = 0;
for(; j < maxBound; ++j)
{
alignas(32) std::array<uint64_t, vecs> stored;
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, ch); });
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
native_simd<VecType> X(stored.data());
native_simd<VecType> PM_j = andnot(X & boundMask, P_flag);

Expand All @@ -529,6 +536,20 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
boundMask = (boundMask << 1) | ((boundMask <= boundMaskSize) & one);
}

for(; j < s2_cur.size(); ++j)
{
alignas(32) std::array<uint64_t, vecs> stored;
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
native_simd<VecType> X(stored.data());
native_simd<VecType> PM_j = andnot(X & boundMask, P_flag);

P_flag |= blsi(PM_j);
T_flag |= andnot(counter, (PM_j == zero));

counter = counter << 1;
boundMask = boundMask << 1;
}

auto counts = popcount(P_flag);
alignas(32) std::array<VecType, vec_width> P_flags;
P_flag.store(P_flags.data());
Expand Down
2 changes: 2 additions & 0 deletions test/distance/tests-Jaro.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ TEST_CASE("JaroTest")
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.994872));
REQUIRE(jaro_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.989744));

REQUIRE(jaro_sim_test(std::string("01"), std::string("1111100000")) == Approx(0.53333333));

REQUIRE(jaro_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487));
REQUIRE(jaro_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.922233));
REQUIRE(jaro_sim_test(
Expand Down
2 changes: 2 additions & 0 deletions test/distance/tests-JaroWinkler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ TEST_CASE("JaroWinklerTest")
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 64), str_multiply(std::string("0"), 65)) == Approx(0.996923));
REQUIRE(jaro_winkler_sim_test(str_multiply(std::string("0"), 63), str_multiply(std::string("0"), 65)) == Approx(0.993846));

REQUIRE(jaro_winkler_sim_test(std::string("01"), std::string("1111100000")) == Approx(0.53333333));

REQUIRE(jaro_winkler_sim_test(std::string("10000000000000000000000000000000000000000000000000000000000000020"), std::string("00000000000000000000000000000000000000000000000000000000000000000")) == Approx(0.979487));
REQUIRE(jaro_winkler_sim_test(std::string("00000000000000100000000000000000000000010000000000000000000000000"), std::string("0000000000000000000000000000000000000000000000000000000000000000000000000000001")) == Approx(0.95334));
REQUIRE(jaro_winkler_sim_test(
Expand Down

0 comments on commit 7a30d6b

Please sign in to comment.