From 46f395ae1d459b68cc9828af9ac8828e5d9f5588 Mon Sep 17 00:00:00 2001
From: Jack Zhou
Date: Fri, 16 Sep 2022 21:31:10 +0800
Subject: [PATCH] Fix ft substr bug (#3279)

* optimize cmakelist

* Add substr pos check
---
 .../faster_tokenizer/CMakeLists.txt            |  2 +-
 .../faster_tokenizer/core/added_vocabulary.cc  |  2 +-
 .../models/faster_wordpiece.cc                 |  6 ++++--
 .../faster_tokenizer/models/wordpiece.cc       |  2 ++
 .../faster_tokenizer/normalizers/normalizer.cc | 18 +++++++++++++++---
 .../faster_tokenizer/utils/utils.cc            |  1 +
 6 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/faster_tokenizer/faster_tokenizer/CMakeLists.txt b/faster_tokenizer/faster_tokenizer/CMakeLists.txt
index c3d13de6e5f7..2fea6d18643a 100644
--- a/faster_tokenizer/faster_tokenizer/CMakeLists.txt
+++ b/faster_tokenizer/faster_tokenizer/CMakeLists.txt
@@ -6,7 +6,7 @@ add_subdirectory(postprocessors)
 add_subdirectory(core)
 add_subdirectory(utils)
 # set the relative path of shared library
-if (NOT APPLE)
+if (UNIX)
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
 endif()
diff --git a/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc b/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc
index 8b047691804d..2c6d47dcf0ca 100644
--- a/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc
+++ b/faster_tokenizer/faster_tokenizer/core/added_vocabulary.cc
@@ -293,7 +293,7 @@ bool AddedVocabulary::FindMatch(const std::string& sequence,
     if (added_tokens.GetIsSingleWord()) {
       bool start_space =
           (curr_start == 0) || !EndWithWord(sequence.substr(0, curr_start));
-      bool stop_space = (curr_end == sequence.length()) ||
+      bool stop_space = (curr_end >= sequence.length()) ||
                         !StartWithWord(sequence.substr(curr_end));
       if (!start_space || !stop_space) {
         // Discard not single word
diff --git a/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc b/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc
index c340d8095337..4272b8a3c5a0 100644
--- a/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc
+++ b/faster_tokenizer/faster_tokenizer/models/faster_wordpiece.cc
@@ -133,6 +133,8 @@ void FasterWordPiece::AppendTokensToOutput(
     if (id == unk_token_id_) {
       value = unk_token_;
     } else {
+      auto c_offset = *curr_offset_in_sequence;
+      c_offset = (std::min)(c_offset, static_cast<int>(sequence.length() - 1));
       value = sequence.substr(*curr_offset_in_sequence,
                               token_substr_length);
     }
@@ -286,7 +288,7 @@ std::vector<core::Token> FasterWordPiece::TokenizeWithoutPreTokenize(
                          &all_tokens);
   }
   if (all_tokens.size() == 0) {
-    ResetOutputAppendUNK(0, sequence.size(), &original_num_tokens, &all_tokens);
+    ResetOutputAppendUNK(0, sequence.size(), &original_num_tokens, &all_tokens);
   }
   VLOG(6) << "All tokens num from TokenizeWithoutPreTokenize: "
           << all_tokens.size();
@@ -374,7 +376,7 @@ std::vector<core::Token> FasterWordPiece::TokenizeWithPreTokenize(
                          &all_tokens);
   }
   if (all_tokens.size() == 0) {
-    ResetOutputAppendUNK(0, sequence.size(), &original_num_tokens, &all_tokens);
+    ResetOutputAppendUNK(0, sequence.size(), &original_num_tokens, &all_tokens);
   }
   VLOG(6) << "All tokens num from TokenizeWithPreTokenize: "
           << all_tokens.size();
diff --git a/faster_tokenizer/faster_tokenizer/models/wordpiece.cc b/faster_tokenizer/faster_tokenizer/models/wordpiece.cc
index 8bc7fd96a645..55e844a100bc 100644
--- a/faster_tokenizer/faster_tokenizer/models/wordpiece.cc
+++ b/faster_tokenizer/faster_tokenizer/models/wordpiece.cc
@@ -185,6 +185,7 @@ core::Vocab WordPiece::GetVocabFromFile(const std::string& file) {
     std::string word_str = word;
     auto leading_spaces = word_str.find_first_not_of(WHITESPACE);
     if (leading_spaces != std::string::npos) {
+      leading_spaces = (std::min)(leading_spaces, word_str.length() - 1);
       word_str = word_str.substr(leading_spaces);
     }
     auto trailing_spaces = word_str.find_last_not_of(WHITESPACE);
@@ -275,6 +276,7 @@ void WordPieceFactory::GetVocabFromFiles(const std::string& files) {
     std::string word_str = word;
     auto leading_spaces = word_str.find_first_not_of(WHITESPACE);
     if (leading_spaces != std::string::npos) {
+      leading_spaces = (std::min)(leading_spaces, word_str.length() - 1);
       word_str = word_str.substr(leading_spaces);
     }
     auto trailing_spaces = word_str.find_last_not_of(WHITESPACE);
diff --git a/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc b/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc
index 4296725eb179..c4a9bfb63475 100644
--- a/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc
+++ b/faster_tokenizer/faster_tokenizer/normalizers/normalizer.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 
 #include "faster_tokenizer/normalizers/normalizer.h"
 #include "faster_tokenizer/utils/utf8.h"
-#include "glog/logging.h"
 #include "faster_tokenizer/normalizers/unicode.h"
+#include "glog/logging.h"
 #include "re2/re2.h"
 #include "unicode/edits.h"
 #include "unicode/errorcode.h"
@@ -100,6 +100,8 @@ void NormalizedString::UpdateNormalizedRange(
   // Retrieve the original characters that are being replaced. This let us
   // compute the change in byte sizes along the way.
   std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
+  n_range.first = (std::min)(n_range.first,
+                             static_cast<uint32_t>(normalized_.length() - 1));
   std::u32string u32replaced_normalized = conv.from_bytes(
       normalized_.substr(n_range.first, n_range.second - n_range.first));
   uint32_t initial_removed = 0;
@@ -332,12 +334,14 @@ NormalizedString& NormalizedString::RStrip() { return LRStrip(false, true); }
 const std::string WHITESPACE = " \n\r\t\f\v";
 
 NormalizedString& NormalizedString::LRStrip(bool left, bool right) {
-  int leading_spaces = 0;
-  int trailing_spaces = 0;
+  uint32_t leading_spaces = 0;
+  uint32_t trailing_spaces = 0;
   std::string new_normalized = normalized_;
   if (left) {
     leading_spaces = new_normalized.find_first_not_of(WHITESPACE);
     if (leading_spaces != std::string::npos) {
+      leading_spaces = (std::min)(
+          leading_spaces, static_cast<uint32_t>(new_normalized.length() - 1));
       new_normalized = new_normalized.substr(leading_spaces);
     }
   }
@@ -534,8 +538,16 @@ bool NormalizedString::Slice(core::Range range,
     ConvertOffsets(&original_range, false);
   }
   uint32_t n_shift = original_range.first;
+
+  original_range.first =
+      (std::min)(original_range.first,
+                 static_cast<uint32_t>(this->original_.length() - 1));
   normalized->original_ = this->original_.substr(
       original_range.first, original_range.second - original_range.first);
+
+  normalized_range.first =
+      (std::min)(normalized_range.first,
+                 static_cast<uint32_t>(this->normalized_.length() - 1));
   normalized->normalized_ = this->normalized_.substr(
       normalized_range.first,
       normalized_range.second - normalized_range.first);
diff --git a/faster_tokenizer/faster_tokenizer/utils/utils.cc b/faster_tokenizer/faster_tokenizer/utils/utils.cc
index 18370b285abc..e10aa9af398d 100644
--- a/faster_tokenizer/faster_tokenizer/utils/utils.cc
+++ b/faster_tokenizer/faster_tokenizer/utils/utils.cc
@@ -39,6 +39,7 @@ void GetVocabFromFiles(const std::string& files,
     std::string word_str = word;
     auto leading_spaces = word_str.find_first_not_of(WHITESPACE);
     if (leading_spaces != std::string::npos) {
+      leading_spaces = (std::min)(leading_spaces, word_str.length() - 1);
       word_str = word_str.substr(leading_spaces);
     }
     auto trailing_spaces = word_str.find_last_not_of(WHITESPACE);
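
A note on the recurring pattern: std::string::substr(pos, len) throws
std::out_of_range whenever pos > size(), so a single out-of-range offset
aborts tokenization; each hunk above therefore clamps pos to size() - 1
before calling substr. (std::min) is parenthesized so that a function-like
min() macro (e.g. from windows.h) cannot expand at the call site. One
caveat: as rendered, the faster_wordpiece.cc hunk computes the clamped
c_offset but still passes *curr_offset_in_sequence to substr; presumably
the clamped value is the one intended, as in the sketch below. The sketch
is a standalone illustration with made-up names, not faster_tokenizer
code, and like the patched call sites it assumes a non-empty string
(length() - 1 wraps around when the string is empty):

#include <algorithm>  // for (std::min)
#include <iostream>
#include <stdexcept>
#include <string>

int main() {
  std::string sequence = "abc";

  try {
    // pos == 4 exceeds sequence.size() == 3, so substr throws.
    std::string token = sequence.substr(4, 2);
  } catch (const std::out_of_range& e) {
    std::cout << "unchecked substr threw: " << e.what() << "\n";
  }

  // The clamp used throughout the patch: pos can no longer exceed
  // size() - 1, so substr succeeds and returns at most the string's tail.
  std::size_t offset = 4;
  offset = (std::min)(offset, sequence.length() - 1);
  std::cout << "clamped substr: " << sequence.substr(offset, 2) << "\n";  // "c"
  return 0;
}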