Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PRs From Contrib #12

Merged
merged 3 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ project(openvino_tokenizers)
include(cmake/platforms.cmake)

# Find OpenVINODeveloperPackage first to compile with SDL flags
find_package(OpenVINODeveloperPackage QUIET)
find_package(OpenVINODeveloperPackage QUIET
PATHS "${OpenVINO_DIR}")
if(NOT OpenVINODeveloperPackage_FOUND)
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
endif()
Expand Down
5 changes: 4 additions & 1 deletion python/openvino_tokenizers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,12 @@ def greedy_decoder(input) -> Model:
return token_ids.output(0)


def add_greedy_decoding(text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME) -> Model:
def add_greedy_decoding(
text_generation_model: Model, logits_output: str = LOGITS_OUTPUT_NAME, output_type: Type = Type.i64
) -> Model:
ppp = PrePostProcessor(text_generation_model)
ppp.output(logits_output).postprocess().custom(greedy_decoder)
ppp.output(logits_output).tensor().set_element_type(output_type)
model = ppp.build()
model.output(logits_output).tensor.set_names({TOKEN_IDS_OUTPUT_NAME})
return model
Expand Down
4 changes: 3 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ if(POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()

set(CMAKE_POSITION_INDEPENDENT_CODE ON)

option(BUILD_FAST_TOKENIZERS OFF)

#
Expand Down Expand Up @@ -222,4 +224,4 @@ if(extra_libs)
install(FILES ${extra_libs} DESTINATION ${extra_libs_location})
endif()

include (CPack)
include (CPack)
71 changes: 30 additions & 41 deletions src/sentence_piece.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,26 +91,21 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
}

void SentencepieceTokenizer::validate_and_infer_types() {

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");

#else

FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");

FRONT_END_GENERAL_CHECK(
// WA: sometimes f32 appeared as a placeholder for unknown type
get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");

#endif
auto input_size = get_input_size();
if(input_size == 2) {
FRONT_END_GENERAL_CHECK(
// WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
"SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
} else if (input_size == 4) {
FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
} else {
OPENVINO_THROW("Unexpected input format. SentencepieceTokenizer accepts one string input or three decomposed string inputs (begins, ends, symbols)");
};

// The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
// and dense shape
Expand All @@ -133,17 +128,7 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
std::vector<int32_t> sparse_values;
std::vector<int64_t> sparse_dense_shape;

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();

auto batch_size = shape_size(inputs[1].get_shape());

#else

auto input_element_type = get_input_element_type(1);
auto input_size = get_input_size();
int32_t batch_size;

// used in case of string tensors
Expand All @@ -154,27 +139,31 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
const int32_t* end_ids;
const uint8_t* data;

if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else if(input_element_type == ov::element::u8) {
parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
if (input_size == 2) {
auto input_element_type = get_input_element_type(1);
if(input_element_type == ov::element::string) {
strings = inputs[1].data<const std::string>();
batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}
} else {
OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
}

#endif
auto begin_ids = inputs[1].data<const int32_t>();
auto end_ids = inputs[2].data<const int32_t>();
auto data = inputs[3].data<const uint8_t>();
batch_size = shape_size(inputs[1].get_shape());
};

size_t max_token_id = 0;
for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
absl::string_view sentence;
if(input_element_type == ov::element::string) {
if (input_size == 2) {
sentence = strings[batch_ind];
} else if(input_element_type == ov::element::u8) {
} else {
auto begin_ind = begin_ids[batch_ind];
auto end_ind = end_ids[batch_ind];
sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
}
};

std::vector<int32_t> ids;
CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
Expand Down
26 changes: 1 addition & 25 deletions src/tensorflow_translators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto sp_model_const = as_type_ptr<Constant>(sp_tokenize_op->input_value(0).get_node_shared_ptr());
FRONT_END_GENERAL_CHECK(sp_model_const, "Conversion expects SentencePiece model to be constant.");

// prepare input six inputs
// prepare input
auto inputs = sp_tokenize_op->input_value(1);

// extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
Expand All @@ -70,27 +70,8 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
auto add_eos = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(5).get_node_shared_ptr(), "add_eos");
auto reverse = extract_scalar_const_value<bool>(sp_tokenize_op->input_value(6).get_node_shared_ptr(), "reverse");

#if !USE_STRING_TENSORS
// Override type of input tensor if this is a Parameter
if (auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr())) {
parameter->set_partial_shape(PartialShape{ Dimension() });
parameter->set_element_type(element::u8);
parameter->validate_and_infer_types();
}
#endif

#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS

OutputVector inputs_vector = OutputVector{ sp_model_const };
auto unpacked_outputs = std::make_shared<StringTensorUnpack>(OutputVector{inputs}, "begins_ends")->outputs();
inputs_vector.insert(inputs_vector.end(), unpacked_outputs.begin(), unpacked_outputs.end());

#else

OutputVector inputs_vector = OutputVector{ sp_model_const, inputs };

#endif

// create a node with custom operation
auto sp_tokenizer_ext = std::make_shared<SentencepieceTokenizer>(inputs_vector, nbest_size, alpha, add_bos, add_eos, reverse);
FRONT_END_GENERAL_CHECK(sp_tokenizer_ext->get_output_size() == 3,
Expand Down Expand Up @@ -182,7 +163,6 @@ ov::OutputVector translate_lookup_table_find_v2(const ov::frontend::NodeContext&

auto wp_tokenizer_inputs = wp_tokenizer->input_values();
wp_tokenizer_inputs.push_back(unk_token_id);
//std::cerr << "Added extra input, total number of inputs is " << wp_tokenizer_inputs.size() << "\n";

auto new_wp_tokenizer = wp_tokenizer->clone_with_new_inputs(wp_tokenizer_inputs);
return { post_translate_ragged_tensor_output(new_wp_tokenizer->outputs()) };
Expand All @@ -209,7 +189,6 @@ ov::OutputVector translate_reshape(const ov::frontend::NodeContext& node) {
auto reshape = std::make_shared<Reshape>(tensor, shape, false);
return {reshape};
}
// set_node_name(node.get_name(), reshape); // TODO: requires dependencies from TF FE internals
}

// Copied and pasted from TF FE and adopted to not use internal TF FE operation classes
Expand All @@ -232,9 +211,7 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
const_node = std::make_shared<ov::op::util::FrameworkNode>(OutputVector{});
}
} else {
//static std::vector<ov::Tensor> tensors;
auto tensor = node.get_attribute<ov::Tensor>("value");
//tensors.push_back(tensor);
const_node = std::make_shared<Constant>(tensor);
#if OPENVINO_ELEMENT_STRING_SUPPORTED
if (const_node->get_element_type() == element::string) {
Expand All @@ -246,6 +223,5 @@ ov::OutputVector translate_const(const ov::frontend::NodeContext& node) {
}
#endif
}
//set_node_name(node.get_name(), const_node); // TODO: Provide alternative to internal function set_node_name
return {const_node};
}
Loading