Add String Mapping Op (#42)

* Add VocabEncoder * Lower Pass Rate Due To Tokenizer Type Change
openvinotoolkit · Mar 1, 2024 · c89b9fa · c89b9fa
1 parent a1afb67
commit c89b9fa
Show file tree

Hide file tree

Showing 17 changed files with 173 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -275,7 +275,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     </tr>
     <tr>
       <td >Tiktoken</td>
-      <td >97.25</td>
+      <td >96.94</td>
       <td >327</td>
     </tr>
     <tr>
@@ -547,7 +547,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td >Tiktoken</td>
       <td >stabilityai/stablelm-2-1_6b</td>
-      <td >96.33</td>
+      <td >95.41</td>
       <td >109</td>
     </tr>
     <tr>

diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -274,10 +274,6 @@ def byte_level_splitter(cls) -> "RegexSplitStep":
             behaviour="isolate",
         )
 
-    @classmethod
-    def add_whitespace_to_the_next_word(cls):
-        return cls(r"\s\S", invert=False, behaviour="merge_with_next")
-
     @classmethod
     def digits_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
         return cls(
@@ -338,6 +334,31 @@ class TokenizationModelStep(BasePipelineStep):
     pass
 
 
+@dataclass
+class VocabEncoderStep(TokenizationModelStep):
+    vocab: List[str] = field(repr=False)
+    vocab_values: Optional[List[int]] = None
+    default_value: int = -1
+
+    def __post_init__(self) -> None:
+        if self.vocab_values is None:
+            self.vocab_values = list(range(len(self.vocab)))
+
+    def get_vocab_node_outputs(self) -> Optional[List[Output]]:
+        return self.get_pipeline().vocab_node_outputs
+
+    def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
+        self.create_string_constant_node(self.vocab).outputs()
+        input_nodes.extend(
+            (
+                *self.create_string_constant_node(self.vocab).outputs(),
+                make_constant_node(np.array(self.vocab_values, dtype=np.int32), Type.i32),
+                make_constant_node(self.default_value, Type.i32)  # default_value
+            )
+        )
+        return _get_factory().create("VocabEncoder", input_nodes).outputs()
+
+
 @dataclass
 class WordPieceTokenizationStep(TokenizationModelStep):
     vocab: List[str] = field(repr=False)

diff --git a/src/bpe_tokenizer.cpp b/src/bpe_tokenizer.cpp
@@ -4,10 +4,10 @@
 
 #include "bpe_tokenizer.hpp"
 #include "utils.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
 
 using namespace ov;
-using namespace ov::opset10;
+using namespace ov::opset13;
 
 #undef tokenizer
 

diff --git a/src/ov_extension.cpp b/src/ov_extension.cpp
@@ -39,6 +39,7 @@ OPENVINO_CREATE_EXTENSIONS(
             std::make_shared<ov::OpExtension<BytesToChars>>(),
             std::make_shared<ov::OpExtension<CombineSegments>>(),
             std::make_shared<ov::OpExtension<RaggedToDense>>(),
+            std::make_shared<ov::OpExtension<VocabEncoder>>(),
             std::make_shared<ov::OpExtension<VocabDecoder>>(),
             std::make_shared<ov::OpExtension<CharsToBytes>>(),
             std::make_shared<ov::OpExtension<TemplateExtension::SentencepieceTokenizer>>(),

diff --git a/src/regex_normalization.hpp b/src/regex_normalization.hpp
@@ -7,11 +7,11 @@
 #include "normalizer.h" // for absl::string_view
 
 #include <openvino/op/op.hpp>
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
 #include "fast_tokenizer/normalizers/normalizers.h"
 
 using namespace ov;
-using namespace ov::opset10;
+using namespace ov::opset13;
 
 class RegexNormalization : public ov::op::Op {
 public:

diff --git a/src/regex_split.cpp b/src/regex_split.cpp
@@ -3,14 +3,14 @@
 //
 
 #include "openvino/op/util/framework_node.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
 
 #include "regex_split.hpp"
 #include "utils.hpp"
 #include "fast_tokenizer/normalizers/normalizers.h"
 
 using namespace ov;
-using namespace ov::opset10;
+using namespace ov::opset13;
 
 namespace {
 

diff --git a/src/regex_split.hpp b/src/regex_split.hpp
@@ -5,7 +5,7 @@
 #pragma once
 
 #include <openvino/op/op.hpp>
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
 #include "fast_tokenizer/pretokenizers/pretokenizers.h"
 
 using namespace ov;

diff --git a/src/sentence_piece.cpp b/src/sentence_piece.cpp
@@ -8,7 +8,7 @@
 #include "model_interface.h"
 
 #include "openvino/op/util/framework_node.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
 
 #include "sentence_piece.hpp"
 #include "utils.hpp"
@@ -18,7 +18,7 @@ using sentencepiece::util::Status;
 using namespace TemplateExtension;
 using namespace ov;
 using namespace ov::frontend;
-using namespace ov::opset10;
+using namespace ov::opset13;
 
 // TODO: Replace shape_size(t.get_shape()) by t.get_size(), where t is ov::Tensor
 

diff --git a/src/tensorflow_translators.cpp b/src/tensorflow_translators.cpp
@@ -3,7 +3,7 @@
 //
 
 #include "openvino/op/util/framework_node.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
 
 #include "tensorflow_translators.hpp"
 #include "utils.hpp"
@@ -21,7 +21,7 @@
 using namespace TemplateExtension;
 using namespace ov;
 using namespace ov::frontend;
-using namespace ov::opset10;
+using namespace ov::opset13;
 
 namespace {
     template<typename T>

diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp
@@ -17,6 +17,7 @@
 #include "wordpiece_tokenizer.hpp"
 #include "bpe_tokenizer.hpp"
 #include "ragged_to_dense.hpp"
+#include "vocab_encoder.hpp"
 #include "vocab_decoder.hpp"
 #include "chars_to_bytes.hpp"
 

diff --git a/src/utils.cpp b/src/utils.cpp
@@ -3,15 +3,15 @@
 //
 
 #include "openvino/op/util/framework_node.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
 #include "utils.hpp"
 #include "string_tensor_pack.hpp"
 #include "string_tensor_unpack.hpp"
 #include "ragged_tensor_pack.hpp"
 
 using namespace ov;
 using namespace ov::frontend;
-using namespace ov::opset10;
+using namespace ov::opset13;
 
 void parse_packed_strings (const Tensor& packed, int32_t& batch_size, const int32_t*& begin_ids, const int32_t*& end_ids, const uint8_t*& symbols) {
     auto strings = packed.data<const uint8_t>();

diff --git a/src/vocab_decoder.cpp b/src/vocab_decoder.cpp
@@ -30,16 +30,8 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
     auto vocab_chars  = inputs[3].data<const uint8_t>();
     auto vocab_size   = inputs[1].get_size();
 
-    std::vector<std::vector<uint8_t>> vocab;
-    vocab.resize(vocab_size);
-
-    std::vector<uint8_t> empty = {};
-
     OPENVINO_ASSERT(inputs.size() == 4, "Too few inputs passed to VocabDecoder, it means it is not converted properly or it is not used in the supported pattern");
 
-    for(size_t id = 0; id < vocab_size; ++id) {
-        vocab[id] = std::vector<uint8_t>(vocab_chars + vocab_begins[id], vocab_chars + vocab_ends[id]);
-    }
     // Set output shapes
     outputs[0].set_shape({batch_size});
     outputs[1].set_shape({batch_size});
@@ -62,17 +54,20 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
 
         for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) {
             auto token_id = input_data[seq];
-            std::vector<uint8_t> token;
-            if (std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()) {
-                token = vocab[token_id];
-            } else {
-                token = empty;
+            int token_size = 0;
+            if (token_id >= vocab_size) {
+                OPENVINO_THROW("Token id is greater then vocabulary size.");
+            } else if (std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()) {
+                std::copy(
+                    vocab_chars + vocab_begins[token_id],
+                    vocab_chars + vocab_ends[token_id],
+                    &new_chars[char_offset]
+                );
+                token_size = vocab_ends[token_id] - vocab_begins[token_id];
             }
 
-            std::copy(token.begin(), token.end(), &new_chars[char_offset]);
-
             new_begins[seq] = char_offset;
-            char_offset += token.size();
+            char_offset += token_size;
             new_ends[seq] = char_offset;
         }
     }

diff --git a/src/vocab_decoder.hpp b/src/vocab_decoder.hpp
@@ -15,8 +15,7 @@ class VocabDecoder : public ov::op::Op {
         const ov::OutputVector& arguments,
         std::vector<int> skip_tokens
     ) :
-        ov::op::Op(arguments) {
-        m_skip_tokens = skip_tokens;
+        ov::op::Op(arguments), m_skip_tokens(skip_tokens) {
         constructor_validate_and_infer_types();
     }
 

diff --git a/src/vocab_encoder.cpp b/src/vocab_encoder.cpp
@@ -0,0 +1,78 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+//
+#ifdef _MSC_VER
+#    pragma warning(disable : 4251)
+#    pragma warning(disable : 4275)
+#endif
+
+#include "vocab_encoder.hpp"
+#include "utils.hpp"
+
+using namespace ov;
+
+
+// Forwards the argument list to the base op, then runs validation / type inference right away.
+VocabEncoder::VocabEncoder(const ov::OutputVector& arguments) : ov::op::Op(arguments) {
+    constructor_validate_and_infer_types();
+}
+
+
+// Validates the inputs and declares the single i32 output.
+//
+// Input indices, as they are read here and in evaluate():
+//   0..2  strings to encode (begins, ends, chars)
+//   3..5  vocab keys        (begins, ends, chars)
+//   6     vocab values, i32
+//   7     default value, type compatible with the values
+void VocabEncoder::validate_and_infer_types() {
+    // Inputs up to index 7 are inspected below; report a broken conversion clearly
+    // instead of querying an input that does not exist.
+    FRONT_END_GENERAL_CHECK(
+        this->get_input_size() >= 8,
+        "Too few inputs passed to VocabEncoder, it means it is not converted properly or it is not used in the supported pattern"
+    );
+    // main string input
+    check_string_input(this, 0);
+    // vocab keys
+    check_string_input(this, 3);
+    // vocab values
+    FRONT_END_GENERAL_CHECK(this->get_input_element_type(6) == element::i32, "Expected an i32 tensor for VocabEncode values.");
+    // vocab.size == vocab_values.size when vocab is static
+    FRONT_END_GENERAL_CHECK(
+        this->get_input_partial_shape(3).is_dynamic() || this->get_input_partial_shape(3) == this->get_input_partial_shape(6),
+        "Expected equal number of vocab keys and values."
+    );
+    // Default value is compatible to vocab values
+    FRONT_END_GENERAL_CHECK(
+        this->get_input_element_type(6).compatible(this->get_input_element_type(7)),
+        "Expected the VocabEncoder default value type to be compatible with the vocab values type."
+    );
+    // one data output, reuse ragged dimensions from split
+    this->set_output_type(0, element::i32, this->get_input_partial_shape(0));
+}
+
+
+// Encodes every input string into an i32 value: the value stored for that string in the
+// vocab, or the default value (input 7) when the string is not a vocab key.
+// Writes one value per element of input 0 into outputs[0] and always returns true.
+bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
+    // inputs[0..7] are dereferenced below
+    OPENVINO_ASSERT(inputs.size() >= 8, "Too few inputs passed to VocabEncoder, it means it is not converted properly or it is not used in the supported pattern");
+
+    // string inputs
+    auto begins = inputs[0].data<const int32_t>();
+    auto ends   = inputs[1].data<const int32_t>();
+    auto chars  = inputs[2].data<const uint8_t>();
+
+    // vocab string keys
+    auto vocab_begins = inputs[3].data<const int32_t>();
+    auto vocab_ends   = inputs[4].data<const int32_t>();
+    auto vocab_chars  = inputs[5].data<const uint8_t>();
+
+    auto vocab_values = inputs[6].data<const int32_t>();
+    auto vocab_size = inputs[6].get_size();
+
+    // validate_and_infer_types() skips the key/value count comparison when the key shape is
+    // dynamic, and the key arrays are indexed by the value count below, so re-check here.
+    OPENVINO_ASSERT(
+        inputs[3].get_size() == vocab_size && inputs[4].get_size() == vocab_size,
+        "Expected equal number of vocab keys and values."
+    );
+
+    std::map<std::vector<uint8_t>, int32_t> vocab;
+    for (size_t i = 0; i < vocab_size; ++i) {
+        // The key is a temporary, so it is moved into the map rather than copied;
+        // a repeated key keeps the value of its last occurrence.
+        vocab.insert_or_assign(
+            std::vector<uint8_t>(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]),
+            vocab_values[i]
+        );
+    }
+
+    auto default_value = *inputs[7].data<const int32_t>();
+    const size_t num_elements = inputs[0].get_size();
+
+    // Set output shape
+    outputs[0].set_shape({num_elements});
+    auto token_ids = outputs[0].data<int32_t>();
+
+    for (size_t element_idx = 0; element_idx < num_elements; ++element_idx) {
+        const std::vector<uint8_t> token(chars + begins[element_idx], chars + ends[element_idx]);
+        const auto found = vocab.find(token);
+        token_ids[element_idx] = (found == vocab.end()) ? default_value : found->second;
+    }
+
+    return true;
+}
diff --git a/src/vocab_encoder.hpp b/src/vocab_encoder.hpp
@@ -0,0 +1,38 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+//
+#pragma once
+#include <vector>
+#include <openvino/op/op.hpp>
+#include "openvino/opsets/opset13.hpp"
+
+using namespace ov;
+using namespace ov::opset13;
+
+
+// Custom operation registered under the type name "VocabEncoder".
+// It declares no attributes of its own; everything it needs arrives through its inputs.
+class VocabEncoder : public ov::op::Op {
+public:
+    OPENVINO_OP("VocabEncoder");
+
+    VocabEncoder() = default;
+
+    // Defined out of line in vocab_encoder.cpp.
+    VocabEncoder(const ov::OutputVector& arguments);
+
+    // Input checks and output type declaration; defined in vocab_encoder.cpp.
+    void validate_and_infer_types() override;
+
+    // A copy of this op is simply a new VocabEncoder built over the replacement inputs.
+    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
+        return std::make_shared<VocabEncoder>(inputs);
+    }
+
+    // Nothing to visit: the class has no data members.
+    bool visit_attributes(ov::AttributeVisitor& visitor) override { return true; }
+
+    // Reference implementation; defined in vocab_encoder.cpp.
+    bool evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const override;
+
+    // Tells the runtime that evaluate() is implemented.
+    bool has_evaluate() const override { return true; }
+};
diff --git a/src/wordpiece_tokenizer.cpp b/src/wordpiece_tokenizer.cpp
@@ -4,10 +4,10 @@
 
 #include "wordpiece_tokenizer.hpp"
 #include "utils.hpp"
-#include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset13.hpp"
 
 using namespace ov;
-using namespace ov::opset10;
+using namespace ov::opset13;
 
 
 WordpieceTokenizer::WordpieceTokenizer(

diff --git a/tests/pass_rates.json b/tests/pass_rates.json
@@ -1,3 +1,3 @@
 {
-    "tokenizers_test.py::test_": 0.8700921600807978
+    "tokenizers_test.py::test_": 0.8699659133947734
 }