Skip to content

Commit

Permalink
use BytesIO for performance on large vocabs, some other updates
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed Dec 18, 2024
1 parent b92b864 commit cb9b0e0
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 50 deletions.
6 changes: 0 additions & 6 deletions python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,6 @@ def create_string_constant_node(value: Union[str, Iterable[str]]) -> List[Output
elif isinstance(value, Iterable):
# support only 1D strings for now
return create_unpacked_string(value)

# TODO: use direct creation of string constants once CVS-159581 is fixed.
values = [bytes(string, "utf-8") if isinstance(string, str) else string for string in value]
str_constant = op.Constant(Type.string, [len(values), values])
return _get_opset_factory("opset15").create("StringTensorUnpack", str_constant.outputs())

else:
raise ValueError(f"Unsupported value type {type(value)}")

Expand Down
24 changes: 11 additions & 13 deletions python/openvino_tokenizers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,26 +305,24 @@ def create_unpacked_string(strings: Iterable[str]) -> List[Output]:
"""
Convert any list of strings to U8/1D numpy array with begins, ends, and chars
"""
strings = list(strings)
if len(strings) == 0:
return [Constant(Tensor(np.array([], dtype=np.uint8))).output(0)]

begins = []
ends = []
chars = bytearray()
buffer = BytesIO()
buffer.write(to_bytes(len(strings)))
begins = BytesIO()
ends = BytesIO()
chars = BytesIO()
offset = 0

for string in strings:
byte_string = string.encode("utf-8") if isinstance(string, str) else string
length = len(byte_string)

begins.append(offset)
begins.write(to_bytes(offset))
offset += length
ends.append(offset)
chars.extend(byte_string)
ends.write(to_bytes(offset))
chars.write(byte_string)

begins = np.array(begins, dtype=np.int32)
ends = np.array(ends, dtype=np.int32)
chars = np.frombuffer(chars, dtype=np.uint8)
begins = np.frombuffer(begins.getvalue(), np.int32)
ends = np.frombuffer(ends.getvalue(), np.int32)
chars = np.frombuffer(chars.getvalue(), np.uint8)

return [Constant(Tensor(x)).output(0) for x in [begins, ends, chars]]
10 changes: 1 addition & 9 deletions src/regex_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
auto input_size = get_input_size();
const bool has_skips = (input_size == 7);

auto r = inputs[5 + has_skips].get_element_type();
std::string split_pattern;
if (inputs[5 + has_skips].get_element_type() == element::u8) {
split_pattern = std::string(inputs[5 + has_skips].data<const char>(), inputs[5 + has_skips].get_size());
} else if (inputs[5 + has_skips].get_element_type() == element::string) {
split_pattern = *inputs[5 + has_skips].data<std::string>();
} else {
OPENVINO_THROW("Unsupported split pattern type: " + inputs[5 + has_skips].get_element_type().get_type_name());
}
std::string split_pattern = std::string(inputs[5 + has_skips].data<const char>(), inputs[5 + has_skips].get_size());
auto pattern_size = inputs[5 + has_skips].get_size();

// Write to common trie structures should be protected to prevent race conditions.
Expand Down
8 changes: 1 addition & 7 deletions src/special_tokens_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,7 @@ void SpecialTokensSplit::validate_and_infer_types() {
}

bool SpecialTokensSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
std::string split_pattern;
if (inputs[5].get_element_type() == element::string) {
split_pattern = *inputs[5].data<std::string>();
} else {
split_pattern = std::string(inputs[5].data<const char>(), inputs[5].get_size());
}

std::string split_pattern = std::string(inputs[5].data<const char>(), inputs[5].get_size());
compile_pattern_if_necessary(split_pattern);

auto input_size = get_input_size();
Expand Down
28 changes: 15 additions & 13 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,22 @@ void check_string_input(const Node* node, size_t input_index) {
// Asserts that input `input_index` of `node` has an element type and rank acceptable for a
// scalar string input. The input's partial shape and element type are read once up front and
// then checked in one of two preprocessor branches.
//
// The `#if` condition below begins with `false`, so its first branch is never compiled
// regardless of USE_STRING_TENSORS; only the u8/1D check in the `#else` branch is active.
void check_string_scalar_input(const Node* node, size_t input_index) {
auto shape = node->get_input_partial_shape(input_index);
auto element_type = node->get_input_element_type(input_index);


#if false && USE_STRING_TENSORS
// This block is not used when we convert ops to decomposed representation (and we really do)
// NOTE(review): as rendered here, this disabled branch holds both an if/else chain and a
// standalone assert — presumably the removed and added sides of a diff shown together.
// Confirm against the actual file before relying on this text.
if (element_type == element::string) {
// String element type: rank must be dynamic or 0.
OPENVINO_ASSERT((shape.rank().is_dynamic() || shape.rank().get_length() == 0),
"string/0D tensor is expected, but observed: ", element_type.get_type_name(), ", ", shape.to_string());
} else if (element_type == element::u8) {
// u8 element type: rank must be dynamic or 1.
OPENVINO_ASSERT((shape.rank().is_dynamic() || shape.rank().get_length() == 1),
"u8/1D tensor is expected, got element type ", element_type.to_string(), ", shape ", shape.to_string());
} else if (element_type == element::dynamic) {
// Dynamic element type: rank may be dynamic, 0, or 1.
OPENVINO_ASSERT((shape.rank().is_dynamic() || shape.rank().get_length() == 0 || shape.rank().get_length() == 1),
"dynamic tensor should be either scalar string or 1D u8 tensor", element_type.to_string(), ", shape ", shape.to_string());
} else {
// NOTE(review): the message queries input 5 rather than `input_index` — looks unintended; confirm.
OPENVINO_THROW("Unsupported split pattern type: " + node->get_input_element_type(5).get_type_name());
}
// Element type must be dynamic or string, and rank must be dynamic or 0.
OPENVINO_ASSERT(
(element_type == element::dynamic || element_type == element::string) &&
(shape.rank().is_dynamic() || shape.rank().get_length() == 0),
"string/0D tensor is expected, but observed: ", element_type.get_type_name(), ", ", shape.to_string());

#else

// Active check: element type must be dynamic or u8, and rank must be dynamic or exactly 1.
OPENVINO_ASSERT(
(element_type == element::dynamic || element_type == element::u8) &&
(shape.rank().is_dynamic() || shape.rank().get_length() == 1),
"u8/1D tensor is expected, got element type ", element_type.to_string(), ", shape ", shape.to_string());

#endif

}

Expand Down
6 changes: 4 additions & 2 deletions src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@
#include <pcre2.h>
#include "absl/strings/string_view.h"

#define OPENVINO_ELEMENT_STRING_SUPPORTED 1
#ifndef OPENVINO_ELEMENT_STRING_SUPPORTED
#define OPENVINO_ELEMENT_STRING_SUPPORTED 0
#endif

#ifndef OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK
#define OPENVINO_USE_INPUT_OUTPUT_STRING_TENSOR_HACK 0
#endif

#define USE_STRING_TENSORS 1 // modify this depending on willingness to use explicit string tensors
#define USE_STRING_TENSORS 0 // modify this depending on willingness to use explicit string tensors

#if USE_STRING_TENSORS && !OPENVINO_ELEMENT_STRING_SUPPORTED
#error "USE_STRING_TENSORS = 1 can be used only when OpenVINO supports element::string that is determined by OPENVINO_ELEMENT_STRING_SUPPORTED == 1"
Expand Down

0 comments on commit cb9b0e0

Please sign in to comment.