From 46f395ae1d459b68cc9828af9ac8828e5d9f5588 Mon Sep 17 00:00:00 2001
From: Jack Zhou
Date: Fri, 16 Sep 2022 21:31:10 +0800
Subject: [PATCH] Fix ft substr bug (#3279)

* optimize cmakelist

* Add substr pos check
---
 .../faster_tokenizer/CMakeLists.txt            |  2 +-
 .../faster_tokenizer/core/added_vocabulary.cc  |  2 +-
 .../models/faster_wordpiece.cc                 |  6 ++++--
 .../faster_tokenizer/models/wordpiece.cc       |  2 ++
 .../faster_tokenizer/normalizers/normalizer.cc | 18 +++++++++++++++---
 .../faster_tokenizer/utils/utils.cc            |  1 +
 6 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/CMakeLists.txt b/faster_tokenizer/faster_tokenizer/CMakeLists.txt
index c3d13de6e5f7..2fea6d18643a 100644
--- a/faster_tokenizer/faster_tokenizer/CMakeLists.txt
+++ b/faster_tokenizer/faster_tokenizer/CMakeLists.txt
@@ -6,7 +6,7 @@ add_subdirectory(postprocessors)
 add_subdirectory(core)
 add_subdirectory(utils)
 # set the relative path of shared library
-if (NOT APPLE)
+if (UNIX)
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
 endif()
diff --git a/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc b/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc
index 8b047691804d..2c6d47dcf0ca 100644
--- a/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc
+++ b/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc
@@ -293,7 +293,7 @@ bool AddedVocabulary::FindMatch(const std::string& sequence,
     if (added_tokens.GetIsSingleWord()) {
       bool start_space =
           (curr_start == 0) || !EndWithWord(sequence.substr(0, curr_start));
-      bool stop_space = (curr_end == sequence.length()) ||
+      bool stop_space = (curr_end >= sequence.length()) ||
                         !StartWithWord(sequence.substr(curr_end));
       if (!start_space || !stop_space) {
         // Discard not single word
diff --git a/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc b/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc
index c340d8095337..4272b8a3c5a0 100644
--- a/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc
+++ b/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc
@@ -133,6 +133,8 @@ void FasterWordPiece::AppendTokensToOutput(
     if (id == unk_token_id_) {
       value = unk_token_;
     } else {
+      auto c_offset = *curr_offset_in_sequence;
+      c_offset = (std::min)(c_offset, static_cast<int>(sequence.length() - 1));
       value = sequence.substr(*curr_offset_in_sequence,
                               token_substr_length);
     }
@@ -286,7 +288,7 @@ std::vector<core::Token> FasterWordPiece::TokenizeWithoutPreTokenize(
                          &all_tokens);
   }
   if (all_tokens.size() == 0) {
-    ResetOutputAppendUNK(0, sequence.size(), &original_num_tokens, &all_tokens);
+    ResetOutputAppendUNK(0, sequence.size(), &original_num_tokens, &all_tokens);
   }
   VLOG(6) << "All tokens num from TokenizeWithoutPreTokenize: "
           << all_tokens.size();
@@ -374,7 +376,7 @@ std::vector<core::Token> FasterWordPiece::TokenizeWithPreTokenize(
                          &all_tokens);
   }
   if (all_tokens.size() == 0) {
-    ResetOutputAppendUNK(0, sequence.size(), &original_num_tokens, &all_tokens);
+    ResetOutputAppendUNK(0, sequence.size(), &original_num_tokens, &all_tokens);
   }
   VLOG(6) << "All tokens num from TokenizeWithPreTokenize: "
           << all_tokens.size();
diff --git a/faster_tokenizer/faster_tokenizer/models/wordpiece.cc b/faster_tokenizer/faster_tokenizer/models/wordpiece.cc
index 8bc7fd96a645..55e844a100bc 100644
--- a/faster_tokenizer/faster_tokenizer/models/wordpiece.cc
+++ b/faster_tokenizer/faster_tokenizer/models/wordpiece.cc
@@ -185,6 +185,7 @@ core::Vocab WordPiece::GetVocabFromFile(const std::string& file) {
     std::string word_str = word;
     auto leading_spaces = word_str.find_first_not_of(WHITESPACE);
     if (leading_spaces != std::string::npos) {
+      leading_spaces = (std::min)(leading_spaces, word_str.length() - 1);
       word_str = word_str.substr(leading_spaces);
     }
     auto trailing_spaces = word_str.find_last_not_of(WHITESPACE);
@@ -275,6 +276,7 @@ void WordPieceFactory::GetVocabFromFiles(const std::string& files) {
     std::string word_str = word;
     auto leading_spaces = word_str.find_first_not_of(WHITESPACE);
     if (leading_spaces != std::string::npos) {
+      leading_spaces = (std::min)(leading_spaces, word_str.length() - 1);
       word_str = word_str.substr(leading_spaces);
     }
     auto trailing_spaces = word_str.find_last_not_of(WHITESPACE);
diff --git a/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc b/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc
index 4296725eb179..c4a9bfb63475 100644
--- a/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc
+++ b/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 
 #include "faster_tokenizer/normalizers/normalizer.h"
 #include "faster_tokenizer/utils/utf8.h"
-#include "glog/logging.h"
 #include "faster_tokenizer/normalizers/unicode.h"
+#include "glog/logging.h"
 #include "re2/re2.h"
 #include "unicode/edits.h"
 #include "unicode/errorcode.h"
@@ -100,6 +100,8 @@ void NormalizedString::UpdateNormalizedRange(
   // Retrieve the original characters that are being replaced. This let us
   // compute the change in byte sizes along the way.
   std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
+  n_range.first = (std::min)(n_range.first,
+                             static_cast<uint32_t>(normalized_.length() - 1));
   std::u32string u32replaced_normalized = conv.from_bytes(
       normalized_.substr(n_range.first, n_range.second - n_range.first));
   uint32_t initial_removed = 0;
@@ -332,12 +334,14 @@ NormalizedString& NormalizedString::RStrip() { return LRStrip(false, true); }
 const std::string WHITESPACE = " \n\r\t\f\v";
 
 NormalizedString& NormalizedString::LRStrip(bool left, bool right) {
-  int leading_spaces = 0;
-  int trailing_spaces = 0;
+  uint32_t leading_spaces = 0;
+  uint32_t trailing_spaces = 0;
   std::string new_normalized = normalized_;
   if (left) {
     leading_spaces = new_normalized.find_first_not_of(WHITESPACE);
     if (leading_spaces != std::string::npos) {
+      leading_spaces = (std::min)(
+          leading_spaces, static_cast<uint32_t>(new_normalized.length() - 1));
       new_normalized = new_normalized.substr(leading_spaces);
     }
   }
@@ -534,8 +538,16 @@ bool NormalizedString::Slice(core::Range range,
     ConvertOffsets(&original_range, false);
   }
   uint32_t n_shift = original_range.first;
+
+  original_range.first =
+      (std::min)(original_range.first,
+                 static_cast<uint32_t>(this->original_.length() - 1));
   normalized->original_ = this->original_.substr(
       original_range.first, original_range.second - original_range.first);
+
+  normalized_range.first =
+      (std::min)(normalized_range.first,
+                 static_cast<uint32_t>(this->normalized_.length() - 1));
   normalized->normalized_ = this->normalized_.substr(
       normalized_range.first,
       normalized_range.second - normalized_range.first);
diff --git a/faster_tokenizer/faster_tokenizer/utils/utils.cc b/faster_tokenizer/faster_tokenizer/utils/utils.cc
index 18370b285abc..e10aa9af398d 100644
--- a/faster_tokenizer/faster_tokenizer/utils/utils.cc
+++ b/faster_tokenizer/faster_tokenizer/utils/utils.cc
@@ -39,6 +39,7 @@ void GetVocabFromFiles(const std::string& files,
     std::string word_str = word;
     auto leading_spaces = word_str.find_first_not_of(WHITESPACE);
     if (leading_spaces != std::string::npos) {
+      leading_spaces = (std::min)(leading_spaces, word_str.length() - 1);
       word_str = word_str.substr(leading_spaces);
     }
     auto trailing_spaces = word_str.find_last_not_of(WHITESPACE);
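
A note on the recurring pattern: std::string::substr(pos, len) throws
std::out_of_range whenever pos > size(), so a single out-of-range offset
aborts tokenization; each hunk above therefore clamps pos to size() - 1
before calling substr. (std::min) is parenthesized so that a function-like
min() macro (e.g. from windows.h) cannot expand at the call site. One
caveat: as rendered, the faster_wordpiece.cc hunk computes the clamped
c_offset but still passes *curr_offset_in_sequence to substr; presumably
the clamped value is the one intended, as in the sketch below. The sketch
is a standalone illustration with made-up names, not faster_tokenizer
code, and like the patched call sites it assumes a non-empty string
(length() - 1 wraps around when the string is empty):

#include <algorithm>  // for (std::min)
#include <iostream>
#include <stdexcept>
#include <string>

int main() {
  std::string sequence = "abc";

  try {
    // pos == 4 exceeds sequence.size() == 3, so substr throws.
    std::string token = sequence.substr(4, 2);
  } catch (const std::out_of_range& e) {
    std::cout << "unchecked substr threw: " << e.what() << "\n";
  }

  // The clamp used throughout the patch: pos can no longer exceed
  // size() - 1, so substr succeeds and returns at most the string's tail.
  std::size_t offset = 4;
  offset = (std::min)(offset, sequence.length() - 1);
  std::cout << "clamped substr: " << sequence.substr(offset, 2) << "\n";  // "c"
  return 0;
}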