Add String Mapping Op #42

Merged
12 commits merged on Mar 1, 2024
29 changes: 25 additions & 4 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -274,10 +274,6 @@ def byte_level_splitter(cls) -> "RegexSplitStep":
behaviour="isolate",
)

@classmethod
def add_whitespace_to_the_next_word(cls):
return cls(r"\s\S", invert=False, behaviour="merge_with_next")

@classmethod
def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
return cls(
@@ -338,6 +334,31 @@ class TokenizationModelStep(BasePipelineStep):
pass


@dataclass
class VocabEncoderStep(TokenizationModelStep):
    vocab: List[str] = field(repr=False)
    vocab_values: Optional[List[int]] = None
    default_value: int = -1

    def __post_init__(self) -> None:
        if self.vocab_values is None:
            self.vocab_values = list(range(len(self.vocab)))

    def get_vocab_node_outputs(self) -> Optional[List[Output]]:
        return self.get_pipeline().vocab_node_outputs

    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
        input_nodes.extend(
            (
                *self.create_string_constant_node(self.vocab).outputs(),
                make_constant_node(np.array(self.vocab_values, dtype=np.int32), Type.i32),
                make_constant_node(self.default_value, Type.i32),  # id used for out-of-vocabulary tokens
            )
        )
        return _get_factory().create("VocabEncoder", input_nodes).outputs()
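
A quick sanity sketch of the new step (hypothetical values; the surrounding pipeline wiring is omitted):

    from openvino_tokenizers.tokenizer_pipeline import VocabEncoderStep

    # vocab_values defaults to sequential ids via __post_init__
    step = VocabEncoderStep(vocab=["[UNK]", "hello", "world"])
    assert step.vocab_values == [0, 1, 2]
    assert step.default_value == -1  # id assigned to out-of-vocabulary tokens

get_ov_subgraph then appends five constants to the incoming string inputs (vocab begins/ends/chars, the i32 values, and the scalar default) and creates the VocabEncoder node on top of them.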


@dataclass
class WordPieceTokenizationStep(TokenizationModelStep):
vocab: List[str] = field(repr=False)
4 changes: 2 additions & 2 deletions src/bpe_tokenizer.cpp
@@ -4,10 +4,10 @@

#include "bpe_tokenizer.hpp"
#include "utils.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/opsets/opset13.hpp"

using namespace ov;
using namespace ov::opset10;
using namespace ov::opset13;

#undef tokenizer

1 change: 1 addition & 0 deletions src/ov_extension.cpp
@@ -39,6 +39,7 @@ OPENVINO_CREATE_EXTENSIONS(
std::make_shared<ov::OpExtension<BytesToChars>>(),
std::make_shared<ov::OpExtension<CombineSegments>>(),
std::make_shared<ov::OpExtension<RaggedToDense>>(),
std::make_shared<ov::OpExtension<VocabEncoder>>(),
std::make_shared<ov::OpExtension<VocabDecoder>>(),
std::make_shared<ov::OpExtension<CharsToBytes>>(),
std::make_shared<ov::OpExtension<TemplateExtension::SentencepieceTokenizer>>(),
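
For context, user code picks the new op up the same way as the existing ones, by loading the extension library into a Core (a minimal sketch; the library path and model name are illustrative and platform-dependent):

    import openvino as ov

    core = ov.Core()
    core.add_extension("libopenvino_tokenizers.so")  # illustrative path
    compiled = core.compile_model("tokenizer.xml", "CPU")  # a model containing VocabEncoder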
4 changes: 2 additions & 2 deletions src/regex_normalization.hpp
@@ -7,11 +7,11 @@
#include "normalizer.h" // for absl::string_view

#include <openvino/op/op.hpp>
#include "openvino/opsets/opset10.hpp"
#include "openvino/opsets/opset13.hpp"
#include "fast_tokenizer/normalizers/normalizers.h"

using namespace ov;
using namespace ov::opset10;
using namespace ov::opset13;

class RegexNormalization : public ov::op::Op {
public:
4 changes: 2 additions & 2 deletions src/regex_split.cpp
@@ -3,14 +3,14 @@
//

#include "openvino/op/util/framework_node.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/opsets/opset13.hpp"

#include "regex_split.hpp"
#include "utils.hpp"
#include "fast_tokenizer/normalizers/normalizers.h"

using namespace ov;
using namespace ov::opset10;
using namespace ov::opset13;

namespace {

2 changes: 1 addition & 1 deletion src/regex_split.hpp
@@ -5,7 +5,7 @@
#pragma once

#include <openvino/op/op.hpp>
#include "openvino/opsets/opset10.hpp"
#include "openvino/opsets/opset13.hpp"
#include "fast_tokenizer/pretokenizers/pretokenizers.h"

using namespace ov;
4 changes: 2 additions & 2 deletions src/sentence_piece.cpp
@@ -8,7 +8,7 @@
#include "model_interface.h"

#include "openvino/op/util/framework_node.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/opsets/opset13.hpp"

#include "sentence_piece.hpp"
#include "utils.hpp"
@@ -18,7 +18,7 @@ using sentencepiece::util::Status;
using namespace TemplateExtension;
using namespace ov;
using namespace ov::frontend;
using namespace ov::opset10;
using namespace ov::opset13;

// TODO: Replace shape_size(t.get_shape()) by t.get_size(), where t is ov::Tensor

4 changes: 2 additions & 2 deletions src/tensorflow_translators.cpp
@@ -3,7 +3,7 @@
//

#include "openvino/op/util/framework_node.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/opsets/opset13.hpp"

#include "tensorflow_translators.hpp"
#include "utils.hpp"
@@ -21,7 +21,7 @@
using namespace TemplateExtension;
using namespace ov;
using namespace ov::frontend;
using namespace ov::opset10;
using namespace ov::opset13;

namespace {
template<typename T>
1 change: 1 addition & 0 deletions src/tokenizer.hpp
@@ -17,6 +17,7 @@
#include "wordpiece_tokenizer.hpp"
#include "bpe_tokenizer.hpp"
#include "ragged_to_dense.hpp"
#include "vocab_encoder.hpp"
#include "vocab_decoder.hpp"
#include "chars_to_bytes.hpp"

4 changes: 2 additions & 2 deletions src/utils.cpp
@@ -3,15 +3,15 @@
//

#include "openvino/op/util/framework_node.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/opsets/opset13.hpp"
#include "utils.hpp"
#include "string_tensor_pack.hpp"
#include "string_tensor_unpack.hpp"
#include "ragged_tensor_pack.hpp"

using namespace ov;
using namespace ov::frontend;
using namespace ov::opset10;
using namespace ov::opset13;

void parse_packed_strings (const Tensor& packed, int32_t& batch_size, const int32_t*& begin_ids, const int32_t*& end_ids, const uint8_t*& symbols) {
auto strings = packed.data<const uint8_t>();
27 changes: 11 additions & 16 deletions src/vocab_decoder.cpp
@@ -30,16 +30,8 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
auto vocab_chars = inputs[3].data<const uint8_t>();
auto vocab_size = inputs[1].get_size();

std::vector<std::vector<uint8_t>> vocab;
vocab.resize(vocab_size);

std::vector<uint8_t> empty = {};

OPENVINO_ASSERT(inputs.size() == 4, "Too few inputs passed to VocabDecoder; the model was not converted properly or the op is not used in the supported pattern");

for(size_t id = 0; id < vocab_size; ++id) {
vocab[id] = std::vector<uint8_t>(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]);
}
// Set output shapes
outputs[0].set_shape({batch_size});
outputs[1].set_shape({batch_size});
@@ -62,17 +54,20 @@

for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) {
auto token_id = input_data[seq];
std::vector<uint8_t> token;
if (std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()) {
token = vocab[token_id];
} else {
token = empty;
int token_size = 0;
if (token_id >= vocab_size) {
OPENVINO_THROW("Token id is greater then vocabulary size.");
} else if (std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()) {
std::copy(
vocab_chars + vocab_begins[token_id],
vocab_chars + vocab_ends[token_id],
&new_chars[char_offset]
);
token_size = vocab_ends[token_id] - vocab_begins[token_id];
}

std::copy(token.begin(), token.end(), &new_chars[char_offset]);

new_begins[seq] = char_offset;
char_offset += token.size();
char_offset += token_size;
new_ends[seq] = char_offset;
}
}
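
The rewritten loop streams token bytes straight from the packed vocab buffers instead of first materializing a per-entry std::vector, and adds a bounds check on the token id. In Python terms, the per-token logic is roughly (illustrative sketch, not part of the diff):

    def decode_token(token_id, vocab_begins, vocab_ends, vocab_chars, skip_tokens, vocab_size):
        # Out-of-range ids are an error; skip-listed ids decode to an empty string.
        if token_id >= vocab_size:
            raise ValueError("Token id is greater than vocabulary size.")
        if token_id in skip_tokens:
            return b""
        return bytes(vocab_chars[vocab_begins[token_id]:vocab_ends[token_id]])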
3 changes: 1 addition & 2 deletions src/vocab_decoder.hpp
@@ -15,8 +15,7 @@ class VocabDecoder : public ov::op::Op {
const ov::OutputVector& arguments,
std::vector<int> skip_tokens
) :
ov::op::Op(arguments) {
m_skip_tokens = skip_tokens;
ov::op::Op(arguments), m_skip_tokens(skip_tokens) {
constructor_validate_and_infer_types();
}

80 changes: 80 additions & 0 deletions src/vocab_encoder.cpp
@@ -0,0 +1,80 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
//
#ifdef _MSC_VER
# pragma warning(disable : 4251)
# pragma warning(disable : 4275)
#endif

#include <algorithm>

#include "vocab_encoder.hpp"
#include "utils.hpp"

using namespace ov;


VocabEncoder::VocabEncoder (const ov::OutputVector& arguments) :
ov::op::Op(arguments) {
constructor_validate_and_infer_types();
}


void VocabEncoder::validate_and_infer_types() {
// main string input
check_string_input(this, 0);
// vocab keys
check_string_input(this, 3);
// vocab values
FRONT_END_GENERAL_CHECK(this->get_input_element_type(6) == element::i32, "Expected an i32 tensor for VocabEncoder values.");
// vocab.size == vocab_values.size when vocab is static
FRONT_END_GENERAL_CHECK(
this->get_input_partial_shape(3).is_dynamic() || this->get_input_partial_shape(3) == this->get_input_partial_shape(6),
"Expected equal number of vocab keys and values."
);
// Default value is compatible to vocab values
FRONT_END_GENERAL_CHECK(get_input_element_type(6).compatible(get_input_element_type(7)));
// one data output, reuse ragged dimensions from split
this->set_output_type(0, element::i32, get_input_partial_shape(0));
}


bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
// string inputs
auto begins = inputs[0].data<const int32_t>();
auto ends = inputs[1].data<const int32_t>();
auto chars = inputs[2].data<const uint8_t>();

// vocab string keys
auto vocab_begins = inputs[3].data<const int32_t>();
auto vocab_ends = inputs[4].data<const int32_t>();
auto vocab_chars = inputs[5].data<const uint8_t>();

auto vocab_values = inputs[6].data<const int32_t>();
auto vocab_size = inputs[6].get_size();

std::map<std::vector<uint8_t>, int32_t> vocab;
for (size_t i = 0; i < vocab_size; ++i) {
std::vector<uint8_t> token(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]);
vocab[token] = vocab_values[i];
}

auto default_value = *inputs[7].data<const int32_t>();
const size_t num_elements = inputs[0].get_size();

// Set output shape
outputs[0].set_shape({num_elements});
auto token_ids = outputs[0].data<int32_t>();

for (size_t element_idx = 0; element_idx < num_elements; ++element_idx) {
auto element = vocab.find(std::vector(chars + begins[element_idx], chars + ends[element_idx]));
if (element == vocab.end()) {
token_ids[element_idx] = default_value;
} else {
token_ids[element_idx] = element->second;
}
}

return true;
}
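
Functionally the op is a hash-map lookup over packed string tensors: inputs 0-2 are the begins/ends/chars of the tokens to encode, inputs 3-5 the vocab keys, input 6 the i32 values, and input 7 the scalar default. A reference sketch of the semantics (illustrative names, plain Python strings standing in for packed tensors):

    def vocab_encode(tokens, vocab_keys, vocab_values, default_value=-1):
        # Mirrors the std::map built in evaluate(): key -> id,
        # with default_value returned for anything not in the vocab.
        table = dict(zip(vocab_keys, vocab_values))
        return [table.get(token, default_value) for token in tokens]

    assert vocab_encode(["hello", "!"], ["hello", "world"], [0, 1]) == [0, -1]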
38 changes: 38 additions & 0 deletions src/vocab_encoder.hpp
@@ -0,0 +1,38 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
//
#pragma once
#include <vector>
#include <openvino/op/op.hpp>
#include "openvino/opsets/opset13.hpp"

using namespace ov;
using namespace ov::opset13;


class VocabEncoder : public ov::op::Op {
public:
OPENVINO_OP("VocabEncoder");

VocabEncoder () = default;
VocabEncoder(
const ov::OutputVector& arguments
);

void validate_and_infer_types() override;

std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
return std::make_shared<VocabEncoder>(inputs);
}

bool visit_attributes(ov::AttributeVisitor& visitor) override {
return true;
}

bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;

bool has_evaluate() const override {
return true;
}
};
4 changes: 2 additions & 2 deletions src/wordpiece_tokenizer.cpp
@@ -4,10 +4,10 @@

#include "wordpiece_tokenizer.hpp"
#include "utils.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/opsets/opset13.hpp"

using namespace ov;
using namespace ov::opset10;
using namespace ov::opset13;


WordpieceTokenizer::WordpieceTokenizer(