Optimize the tokenizer for efficiency #797

Merged · 11 commits · Aug 28, 2024
4 changes: 2 additions & 2 deletions .pipelines/ci.yml
@@ -198,14 +198,14 @@ stages:
   - bash: |
       set -e -x -u
       ./build.sh -DOCOS_ENABLE_C_API=ON
-      cd out/Linux
+      cd out/Linux/RelWithDebInfo
       ctest -C RelWithDebInfo --output-on-failure
     displayName: Build ort-extensions with API enabled and run tests

   - bash: |
       set -e -x -u
       ./build.sh -DOCOS_BUILD_PRESET=token_api_only -DOCOS_BUILD_SHARED_LIB=OFF
-      cd out/Linux
+      cd out/Linux/RelWithDebInfo
       ctest -C RelWithDebInfo --output-on-failure
     displayName: Build ort-extensions with tokenizer API only enabled and run tests
2 changes: 1 addition & 1 deletion docs/development.md
@@ -16,7 +16,7 @@ The package contains all custom operators and some Python scripts to manipulate
 - no-azure: disable AzureOp kernel build in Python package.
 - no-opencv: disable operators based on OpenCV in build.
 - cc-debug: generate debug info for extensions binaries and disable C/C++ compiler optimization.
-- pp_api: enable pre-processing C ABI Python wrapper, `from onnxruntime_extensions.pp_api import *`
+- pp-api: enable pre-processing C ABI Python wrapper, `from onnxruntime_extensions.pp_api import *`
 - cuda-archs: specify the CUDA architectures(like 70, 85, etc.), and the multiple values can be combined with semicolon. The default value is nvidia-smi util output of GPU-0
 - ort\_pkg\_dir: specify ONNXRuntime package directory the extension project is depending on. This is helpful if you want to use some ONNXRuntime latest function which has not been involved in the official build
5 changes: 3 additions & 2 deletions onnxruntime_extensions/_hf_cvt.py
@@ -48,8 +48,9 @@ def convert_json_vocab(hf_tokenizer):
         model_dir = hf_tokenizer.name_or_path
     else:
         model_dir = os.path.dirname(vocab_file)
-    tokenizer_json = json.load(
-        open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
+    f = open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8")
+    tokenizer_json = json.load(f)
+    f.close()
     # get vocab object from json file
     vocab = tokenizer_json.get("model", {}).get("vocab", {})
     sorted_merges = tokenizer_json.get("model", {}).get("merges", [])
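
The rewrite closes the file handle explicitly; the previous inline `open()` inside `json.load(...)` left the handle to the garbage collector. A `with` block would be the more idiomatic fix; a minimal sketch of that alternative (the helper name is illustrative):

    import json
    import os

    def load_tokenizer_json(model_dir, tokenizer_file="tokenizer.json"):
        # the context manager closes the file even if json.load raises
        with open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8") as f:
            return json.load(f)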
6 changes: 4 additions & 2 deletions onnxruntime_extensions/pp_api.py
@@ -7,7 +7,7 @@
 from . import _extensions_pydll as _C
 if not hasattr(_C, "delete_object"):
     raise ImportError(
-        "onnxruntime_extensions is not built with pre-processing C API"
+        "onnxruntime_extensions is not built with pre-processing C API\n"
         "To enable it, please build the package with --ortx-user-option=pp_api")

 create_processor = _C.create_processor
@@ -24,6 +24,7 @@

 class Tokenizer:
     def __init__(self, tokenizer_dir):
+        self.tokenizer = None
         if os.path.isdir(tokenizer_dir):
             self.tokenizer = create_tokenizer(tokenizer_dir)
         else:
@@ -41,7 +42,8 @@ def __init__(self, tokenizer_dir):
                     f"Downloaded HF file '{resolved_full_file}' cannot be found")
             if (os.path.dirname(resolved_full_file) != os.path.dirname(resolved_config_file)):
                 raise FileNotFoundError(
-                    f"Downloaded HF files '{resolved_full_file}' and '{resolved_config_file}' are not in the same directory")
+                    f"Downloaded HF files '{resolved_full_file}' "
+                    f"and '{resolved_config_file}' are not in the same directory")

             tokenizer_dir = os.path.dirname(resolved_full_file)
             self.tokenizer = create_tokenizer(tokenizer_dir)
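
The added `self.tokenizer = None` gives the attribute a defined value even when the constructor takes the Hugging Face download path and fails partway through. A minimal usage sketch of this wrapper, assuming a build with `--ortx-user-option=pp_api`; the `tokenize`/`detokenize` method names are assumptions, as they do not appear in this diff:

    from onnxruntime_extensions.pp_api import Tokenizer

    tok = Tokenizer("/path/to/tokenizer_dir")  # hypothetical local directory with tokenizer.json
    ids = tok.tokenize("Hello, world!")        # assumed method name
    print(tok.detokenize(ids))                 # assumed method name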
102 changes: 62 additions & 40 deletions operators/tokenizer/bpe_kernels.cc
@@ -106,6 +106,7 @@ ustring RemoveConsecutiveSpaces(const ustring& input) {
 KernelBpeTokenizer::KernelBpeTokenizer(const BpeModelConf& conf)
     : bpe_conf_(conf) {
   model_name_ = conf.name_ == nullptr ? "" : conf.name_;
+  CreateUnicodeByteEncoder();
 };

 OrtStatusPtr KernelBpeTokenizer::OnModelAttach(const OrtApi& api, const OrtKernelInfo& info) {
@@ -175,12 +176,28 @@ uint32_t KernelBpeTokenizer::GetTokenId(const std::string& token) const {
   return bbpe_tokenizer_->GetTokenId(token);
 }

+/*
+  Read more here: https://github.com/huggingface/transformers/blob/60bb571e993b7d73257fb64044726b569fef9403/src/transformers/convert_slow_tokenizer.py#L1454
+
+  Note: this is similar to the BPE CreateByteEncoder, however for decoding the .tiktoken bytes
+  we need to store the strings rather than their IDs, and thereby need a separate map.
+*/
+void KernelBpeTokenizer::CreateUnicodeByteEncoder() {
+  char32_t index = 256;
+  for (char32_t i = 0; i < 256; ++i) {
+    if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
+      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(index++);
+    } else {
+      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(i);
+    }
+  }
+}
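
For context, this is the GPT-2-style byte-to-unicode table: every byte maps to a printable code point, and the 68 bytes that are control characters or whitespace (0-32, 127-160, and 173) are remapped to 256, 257, and so on. A Python sketch of the same table the C++ above builds:

    def bytes_to_unicode():
        # non-printable bytes get fresh code points starting at 256,
        # mirroring CreateUnicodeByteEncoder above
        table = {}
        index = 256
        for b in range(256):
            if b < 33 or 127 <= b < 161 or b == 173:
                table[b] = chr(index)
                index += 1
            else:
                table[b] = chr(b)
        return table

    table = bytes_to_unicode()
    assert table[ord(" ")] == "\u0120"  # space maps to 'Ġ', as seen in GPT-2 vocabularies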

 std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
                                                   int64_t max_length,
                                                   bool compute_offset_mapping,
                                                   std::list<OffsetMappingType>& offset_map) const {
   std::vector<int64_t> res;
-  std::list<std::pair<uint32_t, uint32_t>> byte_list;

   bool clean_up_spaces = false;
   if (ModelName() == kModel_CLIP) {
@@ -191,10 +208,10 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
       text = text.strip()
     */
     ustring str = RemoveConsecutiveSpaces(input);
-    if (IsUnicodeSpace(str.front())) {
+    if (!str.empty() && IsUnicodeSpace(str.front())) {
       str.erase(str.begin());
     }
-    if (IsUnicodeSpace(str.back())) {
+    if (!str.empty() && IsUnicodeSpace(str.back())) {
       str.pop_back();
     }
     // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
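
The added `!str.empty()` guards fix a crash path: an input that is entirely whitespace collapses to an empty string, and calling `front()` or `back()` on an empty `std::string` is undefined behavior. The CLIP preprocessing being mirrored, as a Python sketch (the regex follows the Hugging Face `whitespace_clean` helper):

    import re

    def clip_clean(text: str) -> str:
        text = re.sub(r"\s+", " ", text)  # collapse runs of whitespace
        return text.strip()

    print(repr(clip_clean("\n \n")))  # '' : exactly the case the empty() checks now guard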
@@ -274,24 +291,43 @@ std::vector<int64_t> KernelBpeTokenizer::Tokenize(ustring& input,
         }
       }

       // Get byte encodings prior to performing BPE
-      byte_list.clear();
-
+      std::list<std::pair<uint32_t, uint32_t>> byte_list;
+      std::string token_bytes;
+      token_bytes.reserve(utf8_token.size() * 2);
+      size_t token_len = utf8_token.length();
+      size_t end_diff = 0;
       if (clean_up_spaces) {
         // Whitespace clean
         utf8_token.erase(std::remove(utf8_token.begin(), utf8_token.end(), U' '), utf8_token.end());
+        token_len = utf8_token.length() - 1;
+      }
+
-        for (int i = 0; i < utf8_token.length(); i++) {
-          if (i == utf8_token.length() - 1) {
-            std::string boundary(1, utf8_token[i]);
-            byte_list.push_back(std::make_pair(bbpe_tokenizer_->GetTokenId(boundary + "</w>"), 1));
-          } else {
-            byte_list.push_back(std::make_pair(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(utf8_token[i])], 1));
-          }
-        }
+      for (size_t i = 0; i < token_len; i++) {
+        token_bytes += unicode_byte_encoder_[static_cast<unsigned char>(utf8_token[i])];
+      }

+      if (clean_up_spaces) {
+        end_diff = token_bytes.length();
+        if (!utf8_token.empty()) {
+          token_bytes += unicode_byte_encoder_[static_cast<unsigned char>(utf8_token.back())];
+          token_bytes += "</w>";
+        }
+        end_diff = token_bytes.length() - end_diff;
+      }

+      auto id = bbpe_tokenizer_->GetTokenId(token_bytes);
+      if (id != bpe::kInvalidTokenId) {
+        byte_list.push_back(std::make_pair(id, ort_extensions::narrow<uint32_t>(utf8_token.size())));
       } else {
-        for (char& cp : utf8_token) {
-          byte_list.push_back(std::make_pair(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)], 1));
+        token_len = token_bytes.length();
+        for (size_t i = 0; i < token_len - end_diff; /* i++ */) {
+          size_t j = ustring::UTF8Len(token_bytes[i]);
+          byte_list.push_back(std::make_pair(bbpe_tokenizer_->GetTokenId(token_bytes.substr(i, j)), ort_extensions::narrow<uint32_t>(j)));
+          i += j;
         }
+        if (end_diff > 0) {
+          byte_list.push_back(std::make_pair(
+              bbpe_tokenizer_->GetTokenId(token_bytes.substr(token_len - end_diff, end_diff)), ort_extensions::narrow<uint32_t>(end_diff)));
+        }
       }
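
This block is the core of the optimization: the whole byte-encoded token is looked up in the vocabulary once, and only on a miss does the code fall back to seeding the BPE merge loop character by character (keeping the `"</w>"`-suffixed tail intact for CLIP-style models). A Python sketch of the control flow, with a toy vocabulary standing in for `bbpe_tokenizer_`:

    INVALID_ID = 0xFFFFFFFF  # stand-in for bpe::kInvalidTokenId

    def seed_byte_list(token_bytes: str, vocab: dict) -> list:
        """Return (id, length) pairs that seed the BPE merge loop."""
        whole = vocab.get(token_bytes, INVALID_ID)
        if whole != INVALID_ID:
            # fast path: one lookup, nothing left to merge
            return [(whole, len(token_bytes))]
        # slow path: one entry per character
        return [(vocab.get(ch, INVALID_ID), 1) for ch in token_bytes]

    vocab = {"h": 2, "e": 3, "he": 4, "hello": 5}
    print(seed_byte_list("hello", vocab))  # [(5, 5)]  whole-token hit
    print(seed_byte_list("eh", vocab))     # [(3, 1), (2, 1)]  per-character fallback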

@@ -343,7 +379,6 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
                                                      bool compute_offset_mapping,
                                                      std::list<OffsetMappingType>& offset_map) const {
   std::vector<int64_t> res;
-  std::list<std::pair<uint32_t, uint32_t>> byte_list;

   // Add BOS token to result
   res.push_back(bos_token_id_);
@@ -379,7 +414,7 @@ std::vector<int64_t> KernelBpeTokenizer::SpmTokenize(ustring& input,
   }

   // Get byte encodings prior to performing BPE
-  byte_list.clear();
+  std::list<std::pair<uint32_t, uint32_t>> byte_list;

   while (res.size() < max_length && char_pos < ustr.length()) {
     auto chr = ustr[char_pos];
@@ -559,23 +594,6 @@ SpmTokenizer::SpmTokenizer()

 JsonFastTokenizer::JsonFastTokenizer() : KernelBpeTokenizer(kGPT2Configuration) {}

-/*
-  Read more here: https://github.com/huggingface/transformers/blob/60bb571e993b7d73257fb64044726b569fef9403/src/transformers/convert_slow_tokenizer.py#L1454
-
-  Note: this is similar to the BPE CreateByteEncoder, however for decoding the .tiktoken bytes
-  we need to store the strings rather than their IDs, and thereby need a separate map.
-*/
-void JsonFastTokenizer::CreateUnicodeByteEncoder() {
-  char32_t index = 256;
-  for (char32_t i = 0; i < 256; ++i) {
-    if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
-      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(index++);
-    } else {
-      unicode_byte_encoder_[i] = ustring::EncodeUTF8Char(i);
-    }
-  }
-}
-
 std::string JsonFastTokenizer::TokenBytesToString(std::vector<uint8_t>& bytes) {
   std::string result;
   for (auto c : bytes) {
@@ -647,7 +665,6 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
   std::vector<std::tuple<std::vector<uint8_t>, std::vector<uint8_t>, uint32_t>> byte_merges;

   bbpe_tokenizer_ = std::make_unique<BpeModel>();
-  JsonFastTokenizer::CreateUnicodeByteEncoder();

   for (const auto& item : bpe_ranks) {
     std::vector<uint8_t> token = item.first;
@@ -714,13 +731,19 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
     module_ifs >> tok_json;
   } else {
     ifs >> tok_json;
+    // doesn't work for json with nested objects
+    // auto decoders_node = tok_json.find("/decoder/decoders"_json_pointer);
-    auto decoders_node = tok_json.find("decoder");
-    if (decoders_node != tok_json.end()) {
-      decoders_node = decoders_node->find("decoders");
+    bool has_decoders_node = false;
+    auto decoders_node = tok_json.end();
+    auto decoder_node = tok_json.find("decoder");
+    if (decoder_node != tok_json.end()) {
+      decoders_node = decoder_node->find("decoders");
+      if (decoders_node != decoder_node->end()) {
+        has_decoders_node = true;
+      }
     }

-    if (decoders_node->is_array()) {
+    if (has_decoders_node && decoders_node->is_array()) {
       for(auto step = decoders_node->begin(); step != decoders_node->end(); ++step) {
         std::string type = step->value("type", "");
         if (type == "Replace") {
@@ -742,7 +765,6 @@ OrtxStatus JsonFastTokenizer::Load(const ort_extensions::bpe::TokenJsonConfig& c
                            bpe_conf_.get().GetSpecialTokens().c_str(),
                            bpe_conf_.get().spm_model_);
   }
-

   auto added_tokens = tok_json.find("added_tokens");
   if (added_tokens != tok_json.end()) {
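
The decoder-parsing fix addresses a crash on tokenizer.json files without a decoder section: the old code called `is_array()` through `decoders_node` even when the lookup had failed, dereferencing an end iterator. The new flag-guarded lookup is the C++ analogue of this Python sketch:

    import json

    # a tokenizer.json whose "decoder" object has no "decoders" array,
    # the shape that used to dereference an invalid iterator
    tok_json = json.loads('{"decoder": {"type": "ByteLevel"}}')

    decoders = tok_json.get("decoder", {}).get("decoders")
    if isinstance(decoders, list):  # mirrors has_decoders_node && decoders_node->is_array()
        step_types = [step.get("type", "") for step in decoders]
    else:
        step_types = []  # safely skipped where the old code would have crashed
    print(step_types)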
5 changes: 3 additions & 2 deletions operators/tokenizer/bpe_kernels.h
@@ -48,6 +48,8 @@ struct KernelBpeTokenizer {
                 bool compute_offset_mapping,
                 std::list<OffsetMappingType>& offset_map) const;

+  void CreateUnicodeByteEncoder();
+
  protected:
   std::reference_wrapper<BpeModelConf const> bpe_conf_;
   std::string model_name_;
@@ -60,6 +62,7 @@

   std::optional<bool> add_bos_token_;
   std::optional<bool> add_eos_token_;
+  std::string unicode_byte_encoder_[256] = {};
 };

 struct GPT2Tokenizer : KernelBpeTokenizer {
@@ -122,10 +125,8 @@ class JsonFastTokenizer : public KernelBpeTokenizer {
   bool tiktoken_ = false;

  private:
-  void CreateUnicodeByteEncoder();
   std::string TokenBytesToString(std::vector<uint8_t>& bytes);

   BpeModelConf json_conf_;
   std::vector<ort_extensions::bpe::AddedToken> added_tokens_;
-  std::string unicode_byte_encoder_[256] = {};
 };
1 change: 1 addition & 0 deletions operators/tokenizer/bpe_streaming.hpp
@@ -225,6 +225,7 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
     ptrdiff_t z = ustring::ValidateUTF8(text);
     if (z <= 0) {
       text = text.substr(0, -z);
+      text += "\ufffd";  // bad utf-8 string
     }

     decoded_strings.emplace_back(std::move(text));
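
Appending U+FFFD (the Unicode replacement character) after truncating at the last valid boundary follows the usual convention for surfacing malformed UTF-8 instead of dropping it silently. Python's lenient decoder shows the same behavior:

    # the first two bytes of the three-byte UTF-8 encoding of U+20AC ('€')
    bad = b"ok\xe2\x82"
    print(bad.decode("utf-8", errors="replace"))  # 'ok\ufffd': replacement char marks the damage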
25 changes: 0 additions & 25 deletions operators/tokenizer/bpe_tokenizer.hpp
@@ -61,8 +61,6 @@ class BpeModel {

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -142,8 +140,6 @@

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -196,8 +192,6 @@

     if (spm_converted) {
       UpdateSpmByteToken(vocab_map_);
-    } else {
-      CreateByteEncoder();
     }

     uint32_t index = 0;
@@ -336,8 +330,6 @@
     }
   }

-  const auto& ByteEncoder() const { return byte_encoder_; }
-
   uint32_t GetTokenId(const std::string& key) const {
     auto it = vocab_map_.find(key);
     if (it != vocab_map_.end()) {
@@ -370,27 +362,10 @@
     return (static_cast<uint64_t>(i1) << 32) | (i0 & 0xFFFFFFFFLL);
   }

-  void CreateByteEncoder() {
-    char32_t index = 256;
-    for (char32_t i = 0; i < 256; ++i) {
-      /*
-        bs = (
-          list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-        )
-      */
-      if ((i >= 0 && i < 33) || (i >= 127 && i < 161) || (i == 173)) {
-        byte_encoder_[i] = GetTokenId(ustring::EncodeUTF8Char(index++));
-      } else {
-        byte_encoder_[i] = GetTokenId(ustring::EncodeUTF8Char(i));
-      }
-    }
-  }
-
  private:
   std::string end_of_word_suffix_;
   std::map<uint64_t, BpeNode> bpe_rank_;

-  uint32_t byte_encoder_[256] = {};
   std::unordered_map<std::string, uint32_t> vocab_map_;
   std::vector<std::string> id2token_map_;
2 changes: 1 addition & 1 deletion shared/api/tokenizer_impl.cc
@@ -72,7 +72,7 @@ OrtxStatus TokenizerImpl::BatchDecode(const std::vector<span<extTokenId_t const>
     if (!status.IsOk()) {
       return status;
     }
-    t_text.emplace_back(ts_output.AsScalar());
+    t_text.push_back(ts_output.AsScalar());
   }
   return {};
 }
10 changes: 10 additions & 0 deletions test/pp_api_test/test_tokenizer.cc
@@ -290,6 +290,16 @@ TEST(OrtxTokenizerTest, CodeGenTokenizer) {
   EXPECT_TRUE(status.IsOk());
   // std::cout << out_text[0] << std::endl;
   EXPECT_EQ(out_text[0], input[0]);
+
+  // 252 and the following ids cannot be decoded as a valid utf-8 string
+  std::vector<extTokenId_t> invalid_token_ids_span = {14675, 8466, 705, 252, 538, 5374, 82, 329, 4554};
+  std::vector<std::string> out_text1;
+  status = tokenizer->Detokenize({ort_extensions::span<const extTokenId_t>(invalid_token_ids_span)}, out_text1);
+  EXPECT_TRUE(status.IsOk());
+  EXPECT_EQ(out_text1.size(), 1);
+  std::string out_text_ref = out_text1.back();
+  std::cout << out_text_ref << std::endl;
+  EXPECT_EQ(out_text_ref.substr(out_text_ref.length() - 3, 3), "\ufffd");
 }
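
The assertion compares the last three bytes because U+FFFD occupies three bytes (EF BF BD) in UTF-8, so `substr(length() - 3, 3)` slices exactly the replacement character. The same check in Python, on a hypothetical decoded string:

    out_text = "Hello funct \ufffd"  # hypothetical detokenizer output ending in U+FFFD
    assert out_text.encode("utf-8")[-3:] == b"\xef\xbf\xbd"
    assert out_text.endswith("\ufffd")  # equivalent check at the code-point level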

TEST(OrtxTokenizerStreamTest, CodeGenTokenizer) {