Add String Mapping Op (#42)
* Add VocabEncoder

* Lower Pass Rate Due To Tokenizer Type Change
apaniukov authored Mar 1, 2024
1 parent a1afb67 commit c89b9fa
Showing 17 changed files with 173 additions and 40 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -275,7 +275,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >Tiktoken</td>
-<td >97.25</td>
+<td >96.94</td>
<td >327</td>
</tr>
<tr>
@@ -547,7 +547,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >Tiktoken</td>
<td >stabilityai/stablelm-2-1_6b</td>
-<td >96.33</td>
+<td >95.41</td>
<td >109</td>
</tr>
<tr>
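The percentage columns in this report are pass rates over the test counts shown in the last column; a quick arithmetic check of the updated Tiktoken rows (a sketch, assuming that reading of the table):

# 97.25% of 327 ≈ 318 passing tests before, 96.94% ≈ 317 after
print(round(327 * 96.94 / 100))  # -> 317
# 96.33% of 109 ≈ 105 passing before, 95.41% ≈ 104 after
print(round(109 * 95.41 / 100))  # -> 104

That is, one fewer matching case per row, consistent with the commit message about the lower pass rate.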
29 changes: 25 additions & 4 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -274,10 +274,6 @@ def byte_level_splitter(cls) -> "RegexSplitStep":
            behaviour="isolate",
        )

-    @classmethod
-    def add_whitespace_to_the_next_word(cls):
-        return cls(r"\s\S", invert=False, behaviour="merge_with_next")

    @classmethod
    def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
        return cls(
@@ -338,6 +334,31 @@ class TokenizationModelStep(BasePipelineStep):
    pass


@dataclass
class VocabEncoderStep(TokenizationModelStep):
    vocab: List[str] = field(repr=False)
    vocab_values: Optional[List[int]] = None
    default_value: int = -1

    def __post_init__(self) -> None:
        if self.vocab_values is None:
            self.vocab_values = list(range(len(self.vocab)))

    def get_vocab_node_outputs(self) -> Optional[List[Output]]:
        return self.get_pipeline().vocab_node_outputs

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        input_nodes.extend(
            (
                *self.create_string_constant_node(self.vocab).outputs(),
                make_constant_node(np.array(self.vocab_values, dtype=np.int32), Type.i32),
                make_constant_node(self.default_value, Type.i32),  # default value for unknown tokens
            )
        )
        return _get_factory().create("VocabEncoder", input_nodes).outputs()


@dataclass
class WordPieceTokenizationStep(TokenizationModelStep):
    vocab: List[str] = field(repr=False)
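A hedged illustration (not part of the commit) of the new step's defaults; the sample vocabulary is hypothetical, while the class and its helpers are the ones defined above:

# Sketch: VocabEncoderStep defaults, assuming the module context of tokenizer_pipeline.py.
step = VocabEncoderStep(vocab=["[UNK]", "hello", "world"])
assert step.vocab_values == [0, 1, 2]  # filled in by __post_init__
assert step.default_value == -1        # id emitted for out-of-vocabulary strings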
4 changes: 2 additions & 2 deletions src/bpe_tokenizer.cpp
@@ -4,10 +4,10 @@

#include "bpe_tokenizer.hpp"
#include "utils.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"

using namespace ov;
-using namespace ov::opset10;
+using namespace ov::opset13;

#undef tokenizer

1 change: 1 addition & 0 deletions src/ov_extension.cpp
@@ -39,6 +39,7 @@ OPENVINO_CREATE_EXTENSIONS(
    std::make_shared<ov::OpExtension<BytesToChars>>(),
    std::make_shared<ov::OpExtension<CombineSegments>>(),
    std::make_shared<ov::OpExtension<RaggedToDense>>(),
+    std::make_shared<ov::OpExtension<VocabEncoder>>(),
    std::make_shared<ov::OpExtension<VocabDecoder>>(),
    std::make_shared<ov::OpExtension<CharsToBytes>>(),
    std::make_shared<ov::OpExtension<TemplateExtension::SentencepieceTokenizer>>(),
4 changes: 2 additions & 2 deletions src/regex_normalization.hpp
@@ -7,11 +7,11 @@
#include "normalizer.h" // for absl::string_view

#include <openvino/op/op.hpp>
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
#include "fast_tokenizer/normalizers/normalizers.h"

using namespace ov;
-using namespace ov::opset10;
+using namespace ov::opset13;

class RegexNormalization : public ov::op::Op {
public:
4 changes: 2 additions & 2 deletions src/regex_split.cpp
@@ -3,14 +3,14 @@
//

#include "openvino/op/util/framework_node.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"

#include "regex_split.hpp"
#include "utils.hpp"
#include "fast_tokenizer/normalizers/normalizers.h"

using namespace ov;
-using namespace ov::opset10;
+using namespace ov::opset13;

namespace {

2 changes: 1 addition & 1 deletion src/regex_split.hpp
@@ -5,7 +5,7 @@
#pragma once

#include <openvino/op/op.hpp>
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
#include "fast_tokenizer/pretokenizers/pretokenizers.h"

using namespace ov;
4 changes: 2 additions & 2 deletions src/sentence_piece.cpp
@@ -8,7 +8,7 @@
#include "model_interface.h"

#include "openvino/op/util/framework_node.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"

#include "sentence_piece.hpp"
#include "utils.hpp"
@@ -18,7 +18,7 @@ using sentencepiece::util::Status;
using namespace TemplateExtension;
using namespace ov;
using namespace ov::frontend;
-using namespace ov::opset10;
+using namespace ov::opset13;

// TODO: Replace shape_size(t.get_shape()) by t.get_size(), where t is ov::Tensor

4 changes: 2 additions & 2 deletions src/tensorflow_translators.cpp
@@ -3,7 +3,7 @@
//

#include "openvino/op/util/framework_node.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"

#include "tensorflow_translators.hpp"
#include "utils.hpp"
@@ -21,7 +21,7 @@
using namespace TemplateExtension;
using namespace ov;
using namespace ov::frontend;
-using namespace ov::opset10;
+using namespace ov::opset13;

namespace {
template<typename T>
1 change: 1 addition & 0 deletions src/tokenizer.hpp
@@ -17,6 +17,7 @@
#include "wordpiece_tokenizer.hpp"
#include "bpe_tokenizer.hpp"
#include "ragged_to_dense.hpp"
#include "vocab_encoder.hpp"
#include "vocab_decoder.hpp"
#include "chars_to_bytes.hpp"

4 changes: 2 additions & 2 deletions src/utils.cpp
@@ -3,15 +3,15 @@
//

#include "openvino/op/util/framework_node.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
#include "utils.hpp"
#include "string_tensor_pack.hpp"
#include "string_tensor_unpack.hpp"
#include "ragged_tensor_pack.hpp"

using namespace ov;
using namespace ov::frontend;
-using namespace ov::opset10;
+using namespace ov::opset13;

void parse_packed_strings (const Tensor& packed, int32_t& batch_size, const int32_t*& begin_ids, const int32_t*& end_ids, const uint8_t*& symbols) {
    auto strings = packed.data<const uint8_t>();
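For orientation, the string ops in this commit exchange strings in a decomposed form: an i32 begins tensor, an i32 ends tensor (exclusive end offsets), and a flat u8 chars tensor. A small NumPy sketch of that layout, with hypothetical example strings:

import numpy as np

# "hello" occupies bytes [0, 5), "world" occupies bytes [5, 10)
chars = np.frombuffer(b"helloworld", dtype=np.uint8)
begins = np.array([0, 5], dtype=np.int32)
ends = np.array([5, 10], dtype=np.int32)

# the i-th string is recovered as bytes(chars[begins[i]:ends[i]])
assert bytes(chars[begins[0]:ends[0]]) == b"hello"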
27 changes: 11 additions & 16 deletions src/vocab_decoder.cpp
@@ -30,16 +30,8 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
    auto vocab_chars = inputs[3].data<const uint8_t>();
    auto vocab_size = inputs[1].get_size();

-    std::vector<std::vector<uint8_t>> vocab;
-    vocab.resize(vocab_size);
-
-    std::vector<uint8_t> empty = {};
-
    OPENVINO_ASSERT(inputs.size() == 4, "Too few inputs passed to VocabDecoder, it means it is not converted properly or it is not used in the supported pattern");

-    for(size_t id = 0; id < vocab_size; ++id) {
-        vocab[id] = std::vector<uint8_t>(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]);
-    }
    // Set output shapes
    outputs[0].set_shape({batch_size});
    outputs[1].set_shape({batch_size});
@@ -62,17 +54,20 @@

        for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) {
            auto token_id = input_data[seq];
-            std::vector<uint8_t> token;
-            if (std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()) {
-                token = vocab[token_id];
-            } else {
-                token = empty;
-            }
-
-            std::copy(token.begin(), token.end(), &new_chars[char_offset]);
+            int token_size = 0;
+            if (token_id >= vocab_size) {
+                OPENVINO_THROW("Token id is greater than vocabulary size.");
+            } else if (std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()) {
+                std::copy(
+                    vocab_chars + vocab_begins[token_id],
+                    vocab_chars + vocab_ends[token_id],
+                    &new_chars[char_offset]
+                );
+                token_size = vocab_ends[token_id] - vocab_begins[token_id];
+            }

            new_begins[seq] = char_offset;
-            char_offset += token.size();
+            char_offset += token_size;
            new_ends[seq] = char_offset;
        }
    }
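The rewrite above drops the intermediate vocab vector-of-vectors and copies token bytes straight from the vocab buffer, adding a bounds check on the incoming id. A pure-Python sketch of the new per-token branch (names are illustrative, not repository API):

def decode_token(token_id, vocab_begins, vocab_ends, vocab_chars, skip_tokens):
    # out-of-range ids now raise instead of reading past the vocab
    if token_id >= len(vocab_begins):
        raise ValueError("Token id is greater than vocabulary size.")
    # skipped tokens contribute zero bytes to the output
    if token_id in skip_tokens:
        return b""
    # otherwise copy the token bytes directly from the vocab buffer
    return bytes(vocab_chars[vocab_begins[token_id]:vocab_ends[token_id]])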
3 changes: 1 addition & 2 deletions src/vocab_decoder.hpp
@@ -15,8 +15,7 @@ class VocabDecoder : public ov::op::Op {
        const ov::OutputVector& arguments,
        std::vector<int> skip_tokens
    ) :
-        ov::op::Op(arguments) {
-        m_skip_tokens = skip_tokens;
+        ov::op::Op(arguments), m_skip_tokens(skip_tokens) {
        constructor_validate_and_infer_types();
    }

78 changes: 78 additions & 0 deletions src/vocab_encoder.cpp
@@ -0,0 +1,78 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#ifdef _MSC_VER
#    pragma warning(disable : 4251)
#    pragma warning(disable : 4275)
#endif

#include <map>
#include <vector>

#include "vocab_encoder.hpp"
#include "utils.hpp"

using namespace ov;


VocabEncoder::VocabEncoder(const ov::OutputVector& arguments) :
    ov::op::Op(arguments) {
    constructor_validate_and_infer_types();
}


void VocabEncoder::validate_and_infer_types() {
    // main string input
    check_string_input(this, 0);
    // vocab keys
    check_string_input(this, 3);
    // vocab values
    FRONT_END_GENERAL_CHECK(this->get_input_element_type(6) == element::i32, "Expected an i32 tensor for VocabEncoder values.");
    // vocab.size() == vocab_values.size() when the vocab shape is static
    FRONT_END_GENERAL_CHECK(
        this->get_input_partial_shape(3).is_dynamic() || this->get_input_partial_shape(3) == this->get_input_partial_shape(6),
        "Expected equal number of vocab keys and values."
    );
    // the default value must be type-compatible with the vocab values
    FRONT_END_GENERAL_CHECK(get_input_element_type(6).compatible(get_input_element_type(7)));
    // one data output, reuse ragged dimensions from split
    this->set_output_type(0, element::i32, get_input_partial_shape(0));
}


bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
    // string inputs
    auto begins = inputs[0].data<const int32_t>();
    auto ends = inputs[1].data<const int32_t>();
    auto chars = inputs[2].data<const uint8_t>();

    // vocab string keys
    auto vocab_begins = inputs[3].data<const int32_t>();
    auto vocab_ends = inputs[4].data<const int32_t>();
    auto vocab_chars = inputs[5].data<const uint8_t>();

    auto vocab_values = inputs[6].data<const int32_t>();
    auto vocab_size = inputs[6].get_size();

    // build a byte-string -> id lookup table from the vocab inputs
    std::map<std::vector<uint8_t>, int32_t> vocab;
    for (size_t i = 0; i < vocab_size; ++i) {
        std::vector<uint8_t> token(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]);
        vocab[token] = vocab_values[i];
    }

    auto default_value = *inputs[7].data<const int32_t>();
    const size_t num_elements = inputs[0].get_size();

    // Set output shape
    outputs[0].set_shape({num_elements});
    auto token_ids = outputs[0].data<int32_t>();

    for (size_t element_idx = 0; element_idx < num_elements; ++element_idx) {
        auto element = vocab.find(std::vector<uint8_t>(chars + begins[element_idx], chars + ends[element_idx]));
        if (element == vocab.end()) {
            token_ids[element_idx] = default_value;
        } else {
            token_ids[element_idx] = element->second;
        }
    }

    return true;
}
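Functionally, evaluate builds a byte-string-to-id map from inputs 3-6 and looks every input string up in it, falling back to the scalar default from input 7. An equivalent NumPy sketch under the decomposed string layout shown earlier (function and argument names are illustrative):

import numpy as np

def vocab_encode(begins, ends, chars, vocab_tokens, vocab_values, default_value):
    # byte-string -> id table, mirroring the std::map in the C++ code
    table = dict(zip(vocab_tokens, vocab_values))
    ids = [table.get(bytes(chars[b:e]), default_value) for b, e in zip(begins, ends)]
    return np.array(ids, dtype=np.int32)

# e.g. with chars/begins/ends for ["hello", "world"] and a one-entry vocab:
# vocab_encode(begins, ends, chars, [b"hello"], [7], -1) -> array([ 7, -1], dtype=int32)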
38 changes: 38 additions & 0 deletions src/vocab_encoder.hpp
@@ -0,0 +1,38 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <vector>
#include <openvino/op/op.hpp>
#include "openvino/opsets/opset13.hpp"

using namespace ov;
using namespace ov::opset13;


class VocabEncoder : public ov::op::Op {
public:
    OPENVINO_OP("VocabEncoder");

    VocabEncoder() = default;
    VocabEncoder(const ov::OutputVector& arguments);

    void validate_and_infer_types() override;

    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
        return std::make_shared<VocabEncoder>(inputs);
    }

    bool visit_attributes(ov::AttributeVisitor& visitor) override {
        return true;
    }

    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;

    bool has_evaluate() const override {
        return true;
    }
};
4 changes: 2 additions & 2 deletions src/wordpiece_tokenizer.cpp
@@ -4,10 +4,10 @@

#include "wordpiece_tokenizer.hpp"
#include "utils.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"

using namespace ov;
-using namespace ov::opset10;
+using namespace ov::opset13;


WordpieceTokenizer::WordpieceTokenizer(
2 changes: 1 addition & 1 deletion tests/pass_rates.json
@@ -1,3 +1,3 @@
{
"tokenizers_test.py::test_": 0.8700921600807978
"tokenizers_test.py::test_": 0.8699659133947734
}
