Skip to content

Commit

Permalink
[Tokenizers][TF FE] Fix MUSE conversion (#854)
Browse files — browse the repository at this point in the history
* Fix MUSE conversion

* Fix MUSE conversion

* Add Type Argument To Greedy Decoding

* Del PackedString Representation for Sentencepiece

* Del includes

* Del vars for packed strings

* Revert Reshape Translator and Clean Up Unused Code

* Add Decomposed Strings Input Back
  • Loading branch information
apaniukov authored Feb 8, 2024
1 parent d219e31 commit 60c3035
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -91,26 +91,21 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
}

void SentencepieceTokenizer::validate_and_infer_types() {

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");

#else

FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");

FRONT_END_GENERAL_CHECK(
// WA: sometimes f32 appeared as a placeholder for unknown type
get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");

#endif
auto input_size = get_input_size();
if(input_size == 2) {
FRONT_END_GENERAL_CHECK(
// WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
} else if (input_size == 4) {
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
} else {
OPENVINO_THROW("Unexpected input format. SentencepieceTokenizer accepts one string input or three decomposed string inputs (begins, ends, symbols)");
};

// The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
// and dense shape
Expand All @@ -133,17 +128,7 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
std::vector<int32_t> sparse_values;
std::vector<int64_t> sparse_dense_shape;

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();

auto batch_size = shape_size(inputs[1].get_shape());

#else

auto input_element_type = get_input_element_type(1);
auto input_size = get_input_size();
int32_t batch_size;

// used in case of string tensors
Expand All @@ -154,27 +139,31 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
const int32_t* end_ids;
const uint8_t* data;

if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else if(input_element_type == ov::element::u8) {
parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
if (input_size == 2) {
auto input_element_type = get_input_element_type(1);
if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}

#endif
auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();
batch_size = shape_size(inputs[1].get_shape());
};

size_t max_token_id = 0;
for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
absl::string_view sentence;
if(input_element_type == ov::element::string) {
if (input_size == 2) {
sentence = strings[batch_ind];
} else if(input_element_type == ov::element::u8) {
} else {
auto begin_ind = begin_ids[batch_ind];
auto end_ind = end_ids[batch_ind];
sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
}
};

std::vector<int32_t> ids;
CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto sp_model_const = as_type_ptr<Constant>(sp_tokenize_op->input_value(0).get_node_shared_ptr());
FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant.");

// prepare input six inputs
// prepare input
auto inputs = sp_tokenize_op->input_value(1);

// extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
Expand All @@ -70,27 +70,8 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto add_eos = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos");
auto reverse = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse");

#if !USE_STRING_TENSORS
// Override type of input tensor if this is a Parameter
if (auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr())) {
parameter->set_partial_shape(PartialShape{ Dimension() });
parameter->set_element_type(element::u8);
parameter->validate_and_infer_types();
}
#endif

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

OutputVector inputs_vector = OutputVector{ sp_model_const };
auto unpacked_outputs = std::make_shared<StringTensorUnpack>(OutputVector{inputs}, "begins_ends")->outputs();
inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end());

#else

OutputVector inputs_vector = OutputVector{ sp_model_const, inputs };

#endif

// create a node with custom operation
auto sp_tokenizer_ext = std::make_shared<SentencepieceTokenizer>(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse);
FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3,
Expand Down Expand Up @@ -182,7 +163,6 @@ ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext&

auto wp_tokenizer_inputs = wp_tokenizer->input_values();
wp_tokenizer_inputs.push_back(unk_token_id);
//std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n";

auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs);
return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) };
Expand All @@ -209,7 +189,6 @@ ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) {
auto reshape = std::make_shared<Reshape>(tensor, shape, false);
return {reshape};
}
// set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals
}

// Copied and pasted from TF FE and adopted to not use internal TF FE operation classes
Expand All @@ -232,9 +211,7 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
const_node = std::make_shared<ov::op::util::FrameworkNode>(OutputVector{});
}
} else {
//static std::vector<ov::Tensor> tensors;
auto tensor = node.get_attribute<ov::Tensor>("value");
//tensors.push_back(tensor);
const_node = std::make_shared<Constant>(tensor);
#if OPENVINO_ELEMENT_STRING_SUPPORTED
if (const_node->get_element_type() == element::string) {
Expand All @@ -246,6 +223,5 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
}
#endif
}
//set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name
return {const_node};
}

0 comments on commit 60c3035

Please sign in to comment.