Skip to content

Commit

Permalink
properly take boundary cells into account
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Jul 2, 2024
1 parent d19dad8 commit cbdf843
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 4 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
## Changelog

## [3.0.5] - 2024-07-02
### Fixed
- the editops implementation didn't properly account for boundary cells in the Levenshtein matrix.
This could lead to both incorrect results and crashes.

## [3.0.4] - 2023-04-07
### Fixed
- fix tagged version
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
endif()

project(rapidfuzz LANGUAGES CXX VERSION 3.0.4)
project(rapidfuzz LANGUAGES CXX VERSION 3.0.5)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
include(GNUInstallDirs)
Expand Down
19 changes: 17 additions & 2 deletions extras/rapidfuzz_amalgamated.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// RapidFuzz v1.0.2
// Generated: 2024-04-06 15:39:26.940916
// Generated: 2024-07-02 16:47:26.932914
// ----------------------------------------------------------
// This file is an amalgamation of multiple different files.
// You probably shouldn't edit it directly.
Expand Down Expand Up @@ -7719,6 +7719,9 @@ template <typename InputIt1, typename InputIt2>
HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt2>& s2,
size_t max = std::numeric_limits<size_t>::max())
{
assert(s1.size() > 1);
assert(s2.size() > 1);

HirschbergPos hpos = {};
size_t left_size = s2.size() / 2;
size_t right_size = s2.size() - left_size;
Expand All @@ -7727,8 +7730,9 @@ HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt
size_t best_score = std::numeric_limits<size_t>::max();
size_t right_first_pos = 0;
size_t right_last_pos = 0;
// todo: we could avoid this allocation by counting up the right score twice
// not sure what's faster though
std::vector<size_t> right_scores;

{
auto right_row = levenshtein_row(s1.reversed(), s2.reversed(), max, right_size - 1);
if (right_row.dist > max) return find_hirschberg_pos(s1, s2, max * 2);
Expand Down Expand Up @@ -7758,6 +7762,17 @@ HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt
auto left_last_pos = std::min(s1_len, left_row.last_block * 64 + 64);

size_t left_score = left_row.prev_score;
// take boundary into account
if (s1_len >= left_first_pos + right_first_pos) {
size_t right_index = s1_len - left_first_pos - right_first_pos;
if (right_index < right_scores.size()) {
best_score = right_scores[right_index] + left_score;
hpos.left_score = left_score;
hpos.right_score = right_scores[right_index];
hpos.s1_mid = left_first_pos;
}
}

for (size_t i = left_first_pos; i < left_last_pos; ++i) {
size_t col_pos = i % 64;
size_t col_word = i / 64;
Expand Down
17 changes: 16 additions & 1 deletion rapidfuzz/distance/Levenshtein_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1055,6 +1055,9 @@ template <typename InputIt1, typename InputIt2>
HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt2>& s2,
size_t max = std::numeric_limits<size_t>::max())
{
assert(s1.size() > 1);
assert(s2.size() > 1);

HirschbergPos hpos = {};
size_t left_size = s2.size() / 2;
size_t right_size = s2.size() - left_size;
Expand All @@ -1063,8 +1066,9 @@ HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt
size_t best_score = std::numeric_limits<size_t>::max();
size_t right_first_pos = 0;
size_t right_last_pos = 0;
// todo: we could avoid this allocation by counting up the right score twice
// not sure what's faster though
std::vector<size_t> right_scores;

{
auto right_row = levenshtein_row(s1.reversed(), s2.reversed(), max, right_size - 1);
if (right_row.dist > max) return find_hirschberg_pos(s1, s2, max * 2);
Expand Down Expand Up @@ -1094,6 +1098,17 @@ HirschbergPos find_hirschberg_pos(const Range<InputIt1>& s1, const Range<InputIt
auto left_last_pos = std::min(s1_len, left_row.last_block * 64 + 64);

size_t left_score = left_row.prev_score;
// take boundary into account
if (s1_len >= left_first_pos + right_first_pos) {
size_t right_index = s1_len - left_first_pos - right_first_pos;
if (right_index < right_scores.size()) {
best_score = right_scores[right_index] + left_score;
hpos.left_score = left_score;
hpos.right_score = right_scores[right_index];
hpos.s1_mid = left_first_pos;
}
}

for (size_t i = left_first_pos; i < left_last_pos; ++i) {
size_t col_pos = i % 64;
size_t col_word = i / 64;
Expand Down

0 comments on commit cbdf843

Please sign in to comment.