Skip to content

Commit

Permalink
[Tokenizers][TF FE] Fix MUSE conversion (#854)
Browse files — browse the repository at this point in the history
* Fix MUSE conversion

* Fix MUSE conversion

* Add Type Argument To Greedy Decoding

* Del PackedString Representation for Sentencepiece

* Del includes

* Del vars for packed strings

* Revert Reshape Translator and Clean Up Unused Code

* Add Decomposed Strings Input Back
  • Loading branch information
apaniukov authored Feb 8, 2024
1 parent d219e31 commit 60c3035
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -91,26 +91,21 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
}

void SentencepieceTokenizer::validate_and_infer_types() {

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");

#else

FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");

FRONT_END_GENERAL_CHECK(
// WA: sometimes f32 appeared as a placeholder for unknown type
get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");

#endif
auto input_size = get_input_size();
if(input_size == 2) {
FRONT_END_GENERAL_CHECK(
// WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
} else if (input_size == 4) {
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
} else {
OPENVINO_THROW("Unexpected input format. SentencepieceTokenizer accepts one string input or three decomposed string inputs (begins, ends, symbols)");
};

// The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
// and dense shape
Expand All @@ -133,17 +128,7 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
std::vector<int32_t> sparse_values;
std::vector<int64_t> sparse_dense_shape;

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();

auto batch_size = shape_size(inputs[1].get_shape());

#else

auto input_element_type = get_input_element_type(1);
auto input_size = get_input_size();
int32_t batch_size;

// used in case of string tensors
Expand All @@ -154,27 +139,31 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
const int32_t* end_ids;
const uint8_t* data;

if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else if(input_element_type == ov::element::u8) {
parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
if (input_size == 2) {
auto input_element_type = get_input_element_type(1);
if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}

#endif
auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();
batch_size = shape_size(inputs[1].get_shape());
};

size_t max_token_id = 0;
for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
absl::string_view sentence;
if(input_element_type == ov::element::string) {
if (input_size == 2) {
sentence = strings[batch_ind];
} else if(input_element_type == ov::element::u8) {
} else {
auto begin_ind = begin_ids[batch_ind];
auto end_ind = end_ids[batch_ind];
sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
}
};

std::vector<int32_t> ids;
CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto sp_model_const = as_type_ptr<Constant>(sp_tokenize_op->input_value(0).get_node_shared_ptr());
FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant.");

// prepare input six inputs
// prepare input
auto inputs = sp_tokenize_op->input_value(1);

// extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
Expand All @@ -70,27 +70,8 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto add_eos = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos");
auto reverse = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse");

#if !USE_STRING_TENSORS
// Override type of input tensor if this is a Parameter
if (auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr())) {
parameter->set_partial_shape(PartialShape{ Dimension() });
parameter->set_element_type(element::u8);
parameter->validate_and_infer_types();
}
#endif

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

OutputVector inputs_vector = OutputVector{ sp_model_const };
auto unpacked_outputs = std::make_shared<StringTensorUnpack>(OutputVector{inputs}, "begins_ends")->outputs();
inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end());

#else

OutputVector inputs_vector = OutputVector{ sp_model_const, inputs };

#endif

// create a node with custom operation
auto sp_tokenizer_ext = std::make_shared<SentencepieceTokenizer>(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse);
FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3,
Expand Down Expand Up @@ -182,7 +163,6 @@ ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext&

auto wp_tokenizer_inputs = wp_tokenizer->input_values();
wp_tokenizer_inputs.push_back(unk_token_id);
//std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n";

auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs);
return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) };
Expand All @@ -209,7 +189,6 @@ ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) {
auto reshape = std::make_shared<Reshape>(tensor, shape, false);
return {reshape};
}
// set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals
}

// Copied and pasted from TF FE and adopted to not use internal TF FE operation classes
Expand All @@ -232,9 +211,7 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
const_node = std::make_shared<ov::op::util::FrameworkNode>(OutputVector{});
}
} else {
//static std::vector<ov::Tensor> tensors;
auto tensor = node.get_attribute<ov::Tensor>("value");
//tensors.push_back(tensor);
const_node = std::make_shared<Constant>(tensor);
#if OPENVINO_ELEMENT_STRING_SUPPORTED
if (const_node->get_element_type() == element::string) {
Expand All @@ -246,6 +223,5 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
}
#endif
}
//set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name
return {const_node};
}

0 comments on commit 60c3035

Please sign in to comment.