Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Tokenizers][TF FE] Fix MUSE conversion #854

Merged
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@

#include "wordpiece_tokenizer.hpp"

#include <fstream>
#include <iterator>

using namespace TemplateExtension;
using namespace ov;
using namespace ov::frontend;
Expand All @@ -36,14 +39,35 @@ namespace {

// Translates a SentencePieceOp node into a single u8 Constant that carries the
// SentencePiece model bytes taken from the node's "model" attribute.
//
// @param node  Frontend node context; its "model" attribute must hold a std::string.
// @return      One-element OutputVector containing the model Constant.
//
// FRONT_END_GENERAL_CHECK reports a failure when the attribute is not a std::string.
// The model is always taken from the node itself: nothing is read from the local
// filesystem and nothing is printed to stdout during conversion.
OutputVector translate_sentencepiece_op(const NodeContext& node) {
    // extract model to configure SentencePieceTokenizer
    auto sp_model_ov_any = node.get_attribute_as_any("model");
    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
                            "SentencePieceOp configuration model is in incorrect format");
    auto str_spm_model = sp_model_ov_any.as<std::string>();

    // One u8 element per byte of the model string.
    // NOTE(review): str_spm_model is a local, so this relies on Constant copying the
    // buffer during construction - confirm against the Constant constructor contract.
    auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
    return { sp_model_const };
}

//OutputVector translate_sentencepiece_op(const NodeContext& node) {
// // extract model to configure SentencePieceTokenizer
// auto sp_model_ov_any = node.get_attribute_as_any("model");
// FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
// "SentencePieceOp configuration model is in incorrect format");
// auto str_spm_model = sp_model_ov_any.as<std::string>();
// auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
// return { sp_model_const };
//}

NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
// this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer,
// and RaggedTensorToSparse operation- into a custom operation SentencepieceTokenizerExtensionOp
Expand All @@ -60,8 +84,10 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto sp_model_const = as_type_ptr<Constant>(sp_tokenize_op->input_value(0).get_node_shared_ptr());
FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant.");

// prepare input six inputs
// prepare input
auto inputs = sp_tokenize_op->input_value(1);
auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr());
parameter -> set_partial_shape(PartialShape{ Dimension() });

// extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
auto nbest_size = extract_scalar_const_value<int32_t>(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size");
Expand All @@ -70,27 +96,8 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto add_eos = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos");
auto reverse = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse");

#if !USE_STRING_TENSORS
// Override type of input tensor if this is a Parameter
if (auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr())) {
parameter->set_partial_shape(PartialShape{ Dimension() });
parameter->set_element_type(element::u8);
parameter->validate_and_infer_types();
}
#endif

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

OutputVector inputs_vector = OutputVector{ sp_model_const };
auto unpacked_outputs = std::make_shared<StringTensorUnpack>(OutputVector{inputs}, "begins_ends")->outputs();
inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end());

#else

OutputVector inputs_vector = OutputVector{ sp_model_const, inputs };

#endif

// create a node with custom operation
auto sp_tokenizer_ext = std::make_shared<SentencepieceTokenizer>(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse);
FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3,
Expand Down Expand Up @@ -197,18 +204,9 @@ ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) {
FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "Tensorflow Reshape op should have two inputs");
auto tensor = node.get_input(0);
auto shape = node.get_input(1);
if(auto pack = dynamic_cast<StringTensorPack*>(tensor.get_node())) {
// TODO: If it is a beginning of the graph, how to detect strings? It falls in 'else' branch in this case.
// FIXME: Needs extension for a Parameter to prepare it first
auto begins = std::make_shared<Reshape>(pack->input_value(0), shape, false);
auto ends = std::make_shared<Reshape>(pack->input_value(1), shape, false);
auto chars = pack->input_value(2);
auto reshape = post_translate_string_tensor_output({begins, ends, chars});
return {reshape};
} else {
auto reshape = std::make_shared<Reshape>(tensor, shape, false);
return {reshape};
}
auto reshape = std::make_shared<Reshape>(tensor, shape, false);
return {reshape};
// }
// set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need this? Please clean the code a bit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deleted

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should revert these changes; they are not related to the MUSE model.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reverted.

}

Expand Down
Loading