Use opset15 version of Str Pack/Unpack #351

Open · wants to merge 9 commits into master · Changes from 3 commits
18 changes: 16 additions & 2 deletions python/openvino_tokenizers/__init__.py
@@ -66,7 +66,6 @@ def new_fe_init(self, *args, **kwargs):


openvino.runtime.Core.__init__ = new_core_init
openvino.runtime.utils.node_factory.NodeFactory.__init__ = new_factory_init
openvino.frontend.frontend.FrontEnd.__init__ = new_fe_init


@@ -76,6 +75,21 @@ def _get_factory_callable() -> Callable[[], NodeFactory]:
def inner(opset_version: Optional[str] = None) -> NodeFactory:
nonlocal factory
if opset_version not in factory:
openvino.runtime.utils.node_factory.NodeFactory.__init__ = new_factory_init
factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)

return factory[opset_version]

return inner

def _get_opset_factory_callable() -> Callable[[], NodeFactory]:
# factory without extensions
factory = {}

def inner(opset_version: Optional[str] = None) -> NodeFactory:
nonlocal factory
if opset_version not in factory:
openvino.runtime.utils.node_factory.NodeFactory.__init__ = old_factory_init
factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)

return factory[opset_version]
@@ -84,10 +98,10 @@ def inner(opset_version: Optional[str] = None) -> NodeFactory:


_get_factory = _get_factory_callable()
_get_opset_factory = _get_opset_factory_callable()

# some files use the _get_factory function
from .__version__ import __version__ # noqa
from .build_tokenizer import build_rwkv_tokenizer # noqa
from .convert_tokenizer import convert_tokenizer # noqa
from .str_pack import pack_strings, unpack_strings # noqa
from .utils import add_greedy_decoding, connect_models # noqa
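
For context, both getters share one memoization pattern: the first call for a given opset version constructs a NodeFactory and caches it; _get_factory patches in the extension-aware constructor before building, while _get_opset_factory restores the stock constructor so plain opset ops such as the opset15 StringTensorPack/StringTensorUnpack can be created. A minimal self-contained sketch of the caching (the NodeFactory stub here is illustrative, not openvino's class):

import types
from typing import Callable, Dict, Optional

class NodeFactory:
    """Illustrative stand-in for openvino's NodeFactory."""
    def __init__(self, opset: str = "latest") -> None:
        self.opset = opset

def make_cached_factory_getter() -> Callable[[Optional[str]], NodeFactory]:
    cache: Dict[Optional[str], NodeFactory] = {}

    def inner(opset_version: Optional[str] = None) -> NodeFactory:
        # Build once per opset version, then reuse the same instance.
        if opset_version not in cache:
            cache[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)
        return cache[opset_version]

    return inner

_get_opset_factory = make_cached_factory_getter()
assert _get_opset_factory("opset15") is _get_opset_factory("opset15")  # cached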
6 changes: 3 additions & 3 deletions python/openvino_tokenizers/build_tokenizer.py
@@ -21,12 +21,12 @@ def build_rwkv_tokenizer(
tokenizer_output_type: Type = Type.i64,
detokenizer_input_type: Type = Type.i64,
) -> Tuple[Model, Model]:
from openvino_tokenizers import _get_factory
from openvino_tokenizers import _get_factory, _get_opset_factory

input_node = op.Parameter(Type.string, PartialShape(["?"]))
input_node.set_friendly_name("string_input")

output = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
output = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
trie_node = TrieTokenizerStep.from_rwkv_vocab(rwkv_vocab)
output = trie_node.get_ov_subgraph(TokenizerPipeline.add_ragged_dimension(output))

@@ -65,7 +65,7 @@ def build_rwkv_tokenizer(
if clean_up_tokenization_spaces:
RegexDecodingStep.clean_up_tokenization_spaces().get_ov_subgraph(detokenizer_output)

detokenizer_output = _get_factory().create("StringTensorPack", detokenizer_output).outputs()
detokenizer_output = _get_opset_factory("opset15").create("StringTensorPack", detokenizer_output).outputs()
detokenizer_output[0].tensor.add_names({STRING_OUTPUT_NAME})

detokenizer = Model(detokenizer_output, [detokenizer_input], DETOKENIZER_NAME)
4 changes: 2 additions & 2 deletions python/openvino_tokenizers/hf_parser.py
@@ -20,7 +20,7 @@
from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers.convert_slow_tokenizer import import_protobuf

from . import _get_factory
from . import _get_factory, _get_opset_factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
@@ -985,7 +985,7 @@ def get_sp_detokenizer(
if params.utf8_replace_mode is not None and params.utf8_replace_mode != UTF8ReplaceMode.DISABLE:
last_sinks = UTF8ValidateStep(params.utf8_replace_mode).get_ov_subgraph(detokenizer)

string_output = _get_factory().create("StringTensorPack", last_sinks).outputs()
string_output = _get_opset_factory("opset15").create("StringTensorPack", last_sinks).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_detokenizer = Model(string_output, [model_input], DETOKENIZER_NAME)
tokenizer_detokenizer.validate_nodes_and_infer_types()
62 changes: 0 additions & 62 deletions python/openvino_tokenizers/str_pack.py

This file was deleted.
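
The deleted str_pack.py helpers (pack_strings/unpack_strings) implemented the legacy single-buffer packing that the opset15 ops now replace. A reconstruction of that layout for reference; the [batch_size | offsets | chars] field order is inferred from the empty-batch leftover in the utils.py hunk below, so treat it as an assumption:

import numpy as np

def _i32(n: int) -> bytes:
    return n.to_bytes(4, "little")

def legacy_pack_strings(strings):
    # Assumed layout: [batch_size][offset_0 .. offset_N][UTF-8 chars], headers as little-endian i32.
    offsets, chars = bytearray(_i32(0)), bytearray()
    for s in strings:
        chars += s.encode("utf-8")
        offsets += _i32(len(chars))
    return np.frombuffer(bytes(_i32(len(strings)) + offsets + chars), dtype=np.uint8)

packed = legacy_pack_strings(["hi", "abc"])
assert packed[:4].view(np.int32)[0] == 2                  # batch size
assert packed[4:16].view(np.int32).tolist() == [0, 2, 5]  # N + 1 offsets
assert packed[16:].tobytes() == b"hiabc"                  # concatenated bytes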

25 changes: 14 additions & 11 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -14,12 +14,13 @@
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np
from openvino.runtime import Model, Output, PartialShape, Type, op, Shape
from openvino.runtime import Model, Output, PartialShape, Type, op, Shape, Tensor
from openvino.runtime import opset12 as opset
from openvino.runtime.exceptions import OVTypeError, UserInputError
from openvino.runtime.utils.types import as_node, make_constant_node

from . import _get_factory
from . import _get_factory, _get_opset_factory

from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
@@ -31,8 +32,7 @@
VOCAB_SIZE_CACHE_PROPORTION,
UTF8ReplaceMode,
)
from .str_pack import pack_string, pack_strings
from .utils import apply_unicode_to_bytes, generate_tokens_with_space_symbols, has_incompatible_re2_op, quote_meta
from .utils import apply_unicode_to_bytes, generate_tokens_with_space_symbols, has_incompatible_re2_op, quote_meta, create_unpacked_string


logger = logging.getLogger(__name__)
@@ -69,12 +69,15 @@ def get_ov_subgraph(self, *input_nodes: List[Output]) -> List[Output]:
def create_string_constant_node(value: Union[str, Iterable[str]]) -> op.Constant:
if isinstance(value, str):
# string scalar
ps = pack_string(value)
return op.Constant(ps)
else:
return op.Constant(np.frombuffer(bytes(value, "utf-8"), dtype=np.uint8))
# return op.Constant(Tensor(np.array(value)))
elif isinstance(value, Iterable):
# support only 1D strings for now
ps = pack_strings(value)
return _get_factory().create("StringTensorUnpack", op.Constant(ps).outputs())
return create_unpacked_string(value)
# TODO: use direct creation of string constants when CVS-159581 is fixed.
# return _get_opset_factory("opset15").create("StringTensorUnpack", create_str_constant(value).outputs())
else:
raise ValueError(f"Unsupported value type {type(value)}")

def finalize(self) -> None:
"""Called after the entire pipeline has been built"""
@@ -1201,7 +1204,7 @@ def get_tokenizer_ov_subgraph(self) -> Model:

processing_outputs = []
for input_node in string_inputs:
input_node = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
input_node = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()

ragged = []
if isinstance(self.steps[0], SpecialTokensSplit):
@@ -1274,7 +1277,7 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
pipeline_step = step.get_ov_subgraph(input_nodes)
input_nodes = pipeline_step

return _get_factory().create("StringTensorPack", input_nodes).outputs()
return _get_opset_factory("opset15").create("StringTensorPack", input_nodes).outputs()

def get_detokenizer_ov_subgraph(self) -> Model:
self.finalize()
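
As intuition for the detokenizer tail above: StringTensorPack is the inverse of StringTensorUnpack, reassembling a string tensor from begins/ends indices over a shared u8 byte buffer. A pure-numpy sketch of that reassembly (illustrative only, not the op's actual kernel):

import numpy as np

def pack_to_strings(begins, ends, chars) -> list:
    # Rebuild Python strings from the unpacked begins/ends/chars triple.
    data = chars.tobytes()
    return [data[b:e].decode("utf-8") for b, e in zip(begins.tolist(), ends.tolist())]

begins = np.array([0, 2], dtype=np.int32)
ends = np.array([2, 5], dtype=np.int32)
chars = np.frombuffer(b"hiabc", dtype=np.uint8)
assert pack_to_strings(begins, ends, chars) == ["hi", "abc"]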
71 changes: 69 additions & 2 deletions python/openvino_tokenizers/utils.py
@@ -6,11 +6,18 @@
import re
from dataclasses import dataclass, fields, field
from functools import lru_cache
from typing import Any, Dict, Optional, Sequence, Tuple, Union
from typing import Any, Dict, Optional, Sequence, Tuple, Union, Iterable, List
import numpy as np
from numpy.typing import NDArray
from io import BytesIO


import openvino as ov
from openvino import Model, Type
from openvino.preprocess import PrePostProcessor
from openvino.runtime import opset12 as opset
from openvino.op import Constant
from openvino import Tensor

from .constants import (
LOGITS_OUTPUT_NAME,
@@ -21,7 +28,6 @@
rt_info_to_hf_attribute_map,
)


@dataclass
class TokenzierConversionParams:
"""
@@ -290,3 +296,64 @@ def quote_meta(unquoted: Union[str, bytes]) -> str:
symbols.append("\\")
symbols.append(char)
return "".join(symbols)


def to_bytes(number: int) -> bytes:
return number.to_bytes(4, "little")


class UnpackedOutputs:
    def __init__(self, outputs: List) -> None:
        self._outputs = outputs

    def outputs(self) -> List:
        return self._outputs or []


def create_unpacked_string(strings: Iterable[str]) -> UnpackedOutputs:
    """
    Convert a list of strings into unpacked-string outputs: begins and ends (i32) plus chars (u8) 1-D constants.
    """
    strings = list(strings)
    batch_size = len(strings)
    if batch_size == 0:
        # an empty batch still yields the three outputs, just with zero elements
        empty = [np.array([], dtype=np.int32), np.array([], dtype=np.int32), np.array([], dtype=np.uint8)]
        return UnpackedOutputs([Constant(Tensor(x)).output(0) for x in empty])

begins = BytesIO()
ends = BytesIO()
chars = BytesIO()
offset = 0

for string in strings:
byte_string = string.encode("utf-8") if isinstance(string, str) else string
length = len(byte_string)

begins.write(to_bytes(offset))
offset += length
ends.write(to_bytes(offset))
chars.write(byte_string)

begins = np.frombuffer(begins.getvalue(), np.int32)
ends = np.frombuffer(ends.getvalue(), np.int32)
chars = np.frombuffer(chars.getvalue(), np.uint8)

return UnpackedOutputs([Constant(Tensor(x)).output(0) for x in [begins, ends, chars]])


def create_str_constant(strings: Iterable[Union[str, bytes]]) -> Constant:
"""
Create a string constant from strings/bytes.
"""
strings = list(strings)
batch_size = len(strings)
if batch_size == 0:
return Constant(ov.Type.string, [])

strings = [bytes(string, "utf-8") if isinstance(string, str) else string for string in strings]
return Constant(Tensor(np.array(strings)))
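
As a worked example of the layout create_unpacked_string produces (a sketch with illustrative values; only numpy is used here):

import numpy as np

strings = ["hi", "abc"]
byte_strings = [s.encode("utf-8") for s in strings]
lengths = [len(b) for b in byte_strings]
begins = np.cumsum([0] + lengths[:-1]).astype(np.int32)        # start offset of each string
ends = (begins + np.array(lengths)).astype(np.int32)           # one-past-the-end offsets
chars = np.frombuffer(b"".join(byte_strings), dtype=np.uint8)  # concatenated UTF-8 bytes

assert begins.tolist() == [0, 2]
assert ends.tolist() == [2, 5]
assert chars.tobytes() == b"hiabc"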
18 changes: 15 additions & 3 deletions src/regex_split.cpp
@@ -122,7 +122,21 @@ void RegexSplit::validate_and_infer_types() {
}

bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
auto split_pattern = std::string(inputs[5].data<const char>(), inputs[5].get_size());
auto input_size = get_input_size();
const bool has_skips = (input_size == 7);

std::string split_pattern;
if (inputs[5 + has_skips].get_element_type() == element::u8) {
split_pattern = std::string(inputs[5 + has_skips].data<const char>(), inputs[5 + has_skips].get_size());
} else if (inputs[5 + has_skips].get_element_type() == element::string) {
split_pattern = *inputs[5 + has_skips].data<std::string>();
} else {
OPENVINO_THROW("Unsupported split pattern type: " + inputs[5 + has_skips].get_element_type().get_type_name());
}

// Write to common trie structures should be protected to prevent race conditions.
{
std::lock_guard<std::mutex> lock(m_mutex);
@@ -138,7 +152,6 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp
}
};

auto input_size = get_input_size();
{
// Write to common trie structures should be protected to prevent race conditions.
std::lock_guard<std::mutex> lock(m_mutex);
@@ -169,7 +182,6 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inp

bool * skips;
bool init_skips = false;
const bool has_skips = (input_size == 7);
if (has_skips) {
skips = inputs[5].data<bool>();
outputs[5].set_shape(Shape{num_chars});
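
The element-type branch above accepts the split pattern either as the legacy packed u8 tensor or as a native string tensor. From Python, the two inputs can be built roughly like this (string-tensor creation from a numpy array follows the Constant(Tensor(np.array(...))) pattern already used in this diff; treat the exact Tensor behavior as an assumption):

import numpy as np
from openvino import Tensor

pattern = r"\s+"

# Legacy element::u8 representation: raw UTF-8 bytes of the pattern.
u8_pattern = Tensor(np.frombuffer(pattern.encode("utf-8"), dtype=np.uint8))

# element::string representation: a numpy unicode array becomes a string tensor.
str_pattern = Tensor(np.array([pattern]))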
8 changes: 7 additions & 1 deletion src/special_tokens_split.cpp
@@ -59,7 +59,13 @@ void SpecialTokensSplit::validate_and_infer_types() {
}

bool SpecialTokensSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
auto split_pattern = std::string(inputs[5].data<const char>(), inputs[5].get_size());
std::string split_pattern;
if (inputs[5].get_element_type() == element::string) {
split_pattern = *inputs[5].data<std::string>();
} else {
split_pattern = std::string(inputs[5].data<const char>(), inputs[5].get_size());
}

compile_pattern_if_necessary(split_pattern);

auto input_size = get_input_size();
33 changes: 0 additions & 33 deletions src/string_tensor_pack.cpp

This file was deleted.
