Skip to content

Commit

Permalink
Merge branch 'master' into support-glm-edge
Browse files Browse the repository at this point in the history
  • Loading branch information
apaniukov authored Dec 20, 2024
2 parents 254672e + df5420c commit 0d31c78
Show file tree
Hide file tree
Showing 12 changed files with 112 additions and 49 deletions.
7 changes: 3 additions & 4 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ jobs:

- name: Upload openvino tokenizers package
if: ${{ always() }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.tar.gz
Expand Down Expand Up @@ -202,7 +202,7 @@ jobs:

- name: Upload openvino tokenizers wheel
if: ${{ always() }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }}
path: ${{ env.BUILD_DIR }}/*.whl
Expand Down Expand Up @@ -268,9 +268,8 @@ jobs:
env:
PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu"

- name: Tokenizers regression tests (using openvino python modules)
- name: Tokenizers regression tests (using openvino python wheels)
run: |
source ${INSTALL_DIR}/setupvars.sh
python3 -m pytest layer_tests.py
python3 -m pytest tokenizers_test.py
working-directory: ${{ env.OPENVINO_TOKENIZERS_REPO }}/tests
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:

- name: Upload openvino package
if: steps.openvino_download.outcome == 'success'
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: openvino_package
path: openvino_package.tar.gz
Expand Down Expand Up @@ -169,7 +169,7 @@ jobs:

- name: Upload openvino package
if: ${{ always() }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: openvino_package
path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz
Expand Down Expand Up @@ -252,7 +252,7 @@ jobs:

- name: Upload openvino tokenizers package
if: ${{ always() }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.tar.gz
Expand Down Expand Up @@ -335,7 +335,7 @@ jobs:

- name: Upload openvino tokenizers wheel
if: ${{ always() }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: openvino_tokenizers_wheel
path: ${{ env.BUILD_DIR }}/*.whl
Expand Down
9 changes: 5 additions & 4 deletions .github/workflows/sdl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@ on:
- master
- 'releases/**'

permissions:
actions: read
contents: read
security-events: write
permissions: read-all

concurrency:
# github.ref is not unique in post-commit
Expand All @@ -29,6 +26,10 @@ jobs:
run:
shell: bash
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write

steps:
- name: Clone Openvino tokenizers sources and tests
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ jobs:
runs-on: aks-linux-2-cores-8gb
container:
image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0'
volumes: [ "/mount:/mount" ]
volumes:
- /mount:/mount
- ${{ github.workspace }}:${{ github.workspace }}
continue-on-error: true

steps:
Expand Down Expand Up @@ -145,7 +147,7 @@ jobs:

- name: Upload openvino tokenizers package
if: ${{ always() }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
path: ${{ env.BUILD_DIR }}/*.zip
Expand Down Expand Up @@ -218,7 +220,7 @@ jobs:

- name: Upload openvino tokenizers wheel
if: ${{ always() }}
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: openvino_tokenizers_wheel
path: ${{ env.BUILD_DIR }}/*.whl
Expand Down
24 changes: 19 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,37 @@ name = "openvino-tokenizers"
version = "2025.0.0.0"
description = "Convert tokenizers into OpenVINO models"
requires-python = ">=3.9"
readme = {file = "README.md", content-type="text/markdown"}
license = {text = "OSI Approved :: Apache Software License"}
readme = { file = "README.md", content-type="text/markdown" }
license = { "file" = "LICENSE" }

authors = [
{ name = "OpenVINO Developers", email = "openvino@intel.com" },
]

classifiers = [
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries :: Python Modules",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Operating System :: Unix",
"Operating System :: POSIX :: Linux",
"Operating System :: Microsoft :: Windows",
"Operating System :: MacOS",
"Programming Language :: C++",
"Programming Language :: C",
"Programming Language :: Python :: 3 :: Only",
]

dependencies = [
# support of nightly openvino packages with dev suffix
"openvino~=2025.0.0.0.dev"
"openvino~=2025.0.0.dev"
]

[project.optional-dependencies]
Expand Down Expand Up @@ -100,12 +113,13 @@ OPENVINO_TOKENIZERS_INSTALL_BINDIR = "openvino_tokenizers/lib"
OPENVINO_TOKENIZERS_INSTALL_LIBDIR = "openvino_tokenizers/lib"

[tool.py-build-cmake.wheel]
python_tag = ['py3']
python_abi = "none"

[build-system]
requires = [
"py-build-cmake==0.3.0",
"py-build-cmake==0.3.3",
"cmake~=3.14",
"openvino~=2025.0.0.0.dev"
"openvino~=2025.0.0.dev"
]
build-backend = "py_build_cmake.build"
12 changes: 6 additions & 6 deletions python/openvino_tokenizers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
from itertools import chain
from pathlib import Path
from typing import Callable
from typing import Callable, Optional

import openvino
from openvino.runtime.utils.node_factory import NodeFactory
Expand Down Expand Up @@ -71,14 +71,14 @@ def new_fe_init(self, *args, **kwargs):


def _get_factory_callable() -> Callable[[], NodeFactory]:
factory = None
factory = {}

def inner() -> NodeFactory:
def inner(opset_version: Optional[str] = None) -> NodeFactory:
nonlocal factory
if factory is None:
factory = NodeFactory()
if factory.get(opset_version, False) == False:
factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)

return factory
return factory[opset_version]

return inner

Expand Down
16 changes: 8 additions & 8 deletions python/openvino_tokenizers/hf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from openvino import Model, PartialShape, Type
from openvino.runtime import Node, op
from openvino.runtime.exceptions import OVTypeError
from openvino.runtime.opset1.ops import _get_node_factory_opset1
from openvino.runtime.utils.types import as_node, make_constant_node
from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers.convert_slow_tokenizer import import_protobuf
Expand Down Expand Up @@ -391,7 +390,7 @@ def decoding(self) -> None:
self.pipeline.add_steps(CharsToBytesStep())
else:
self.pipeline.add_steps(FuseStep())

if self.utf8_replace_mode is not None and (self.utf8_replace_mode != UTF8ReplaceMode.DISABLE):
self.pipeline.add_steps(UTF8ValidateStep(mode=self.utf8_replace_mode))

Expand Down Expand Up @@ -446,16 +445,17 @@ def convert_fast_tokenizer(
filtered_outputs = []
for i, output_name in enumerate(ov_tokenizer_output_names):
current_output = next(
(output for output in ov_tokenizer.outputs if output.any_name == output_name),
(output for output in ov_tokenizer.outputs if output_name in output.names),
False,
)
if current_output:
filtered_outputs.append(current_output)
filtered_outputs[-1].add_names({output_name})
continue

if output_name in output_names:
ov_tokenizer.output(i).tensor.add_names({output_name})
filtered_outputs.append(ov_tokenizer.output(i))
filtered_outputs[-1].add_names({output_name})

tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters(), TOKENIZER_NAME)

Expand Down Expand Up @@ -836,10 +836,10 @@ def convert_sentencepiece_model_tokenizer(
)

if do_left_padding:
attention_mask = _get_node_factory_opset1().create(
attention_mask = _get_factory("opset1").create(
"Reverse", [attention_mask, make_constant_node(np.array([-1]))], {"mode": "index"}
)
scattered_input_ids = _get_node_factory_opset1().create(
scattered_input_ids = _get_factory("opset1").create(
"Reverse", [scattered_input_ids, make_constant_node(np.array([-1]))], {"mode": "index"}
)

Expand All @@ -863,8 +863,8 @@ def convert_sentencepiece_model_tokenizer(
outputs = scattered_input_ids.outputs()

if add_attention_mask:
attention_mask.output(0).tensor.add_names({ATTENTION_MASK_INPUT_NAME})
outputs.append(attention_mask.output(0))
outputs[-1].add_names({ATTENTION_MASK_INPUT_NAME})

tokenizer = Model(outputs, [input_node], TOKENIZER_NAME)
tokenizer.validate_nodes_and_infer_types()
Expand Down Expand Up @@ -982,7 +982,7 @@ def get_sp_detokenizer(

if params.clean_up_tokenization_spaces:
detokenizer = RegexDecodingStep.clean_up_tokenization_spaces().get_ov_subgraph(detokenizer)

last_sinks = detokenizer
if params.utf8_replace_mode is not None and params.utf8_replace_mode != UTF8ReplaceMode.DISABLE:
last_sinks = UTF8ValidateStep(params.utf8_replace_mode).get_ov_subgraph(detokenizer)
Expand Down
4 changes: 2 additions & 2 deletions python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,8 +993,8 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
0
) # TODO: Change RaggedToDense to generate mask of any type

mask.tensor.add_names({ATTENTION_MASK_INPUT_NAME})
outputs.append(mask)
outputs[-1].add_names({ATTENTION_MASK_INPUT_NAME})

return outputs

Expand Down Expand Up @@ -1026,7 +1026,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
else:
vocab_outputs = self.create_string_constant_node(self.vocab).outputs()
input_nodes.extend(vocab_outputs)

# Put constant with skip tokens even if do_skip_tokens=False, so that it can be switched on/off at runtime.
# Slice through all skip tokens if flag is true, else slice to get an empty tensor.
stop_const = op.Constant(Type.i32, Shape([1]), [np.iinfo(np.int32).max if self.do_skip_tokens else 0])
Expand Down
7 changes: 5 additions & 2 deletions src/regex_normalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ std::string reformat_replace_pattern(std::string replace_pattern) {

const std::map<std::string, std::string> search_pattern_rewrites = {
{R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))", R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))"},
{R"((^)(.))", R"((^)([\s\S]))"}
{R"((^)(.))", R"((^)([\s\S]))"},
{R"((^)(.+))", R"((^)([\s\S]))"}
};

/**
Expand All @@ -45,7 +46,9 @@ std::string fix_search_pattern(const std::string search_pattern) {
if (it == search_pattern_rewrites.end()) {
return search_pattern;
}
std::cerr << "Replace search pattern: `" << search_pattern << "` -> `" << it->second << "`" << std::endl;
if (getenv_bool("OPENVINO_TOKENIZERS_PRINT_DEBUG_INFO", false)) {
std::cerr << "Replace search pattern: `" << search_pattern << "` -> `" << it->second << "`" << std::endl;
}
return it->second;
}

Expand Down
Loading

0 comments on commit 0d31c78

Please sign in to comment.