Merge branch 'master' into support-glm-edge

openvinotoolkit · Dec 20, 2024 · 0d31c78
2 parents 254672e + df5420c
commit 0d31c78
Show file tree

Hide file tree

Showing 12 changed files with 112 additions and 49 deletions.
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -136,7 +136,7 @@ jobs:
 
       - name: Upload openvino tokenizers package
         if: ${{ always() }}
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
         with:
           name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
           path: ${{ env.BUILD_DIR }}/*.tar.gz
@@ -202,7 +202,7 @@ jobs:
 
       - name: Upload openvino tokenizers wheel
         if: ${{ always() }}
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
         with:
           name: openvino_tokenizers_wheel_${{ matrix.build_fast_tokenizers }}
           path: ${{ env.BUILD_DIR }}/*.whl
@@ -268,9 +268,8 @@ jobs:
         env:
           PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu"
 
-      - name: Tokenizers regression tests (using openvino python modules)
+      - name: Tokenizers regression tests (using openvino python wheels)
         run: |
-          source ${INSTALL_DIR}/setupvars.sh
           python3 -m pytest layer_tests.py
           python3 -m pytest tokenizers_test.py
         working-directory: ${{ env.OPENVINO_TOKENIZERS_REPO }}/tests

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
@@ -46,7 +46,7 @@ jobs:
 
       - name: Upload openvino package
         if: steps.openvino_download.outcome == 'success'
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
         with:
           name: openvino_package
           path: openvino_package.tar.gz
@@ -169,7 +169,7 @@ jobs:
 
       - name: Upload openvino package
         if: ${{ always() }}
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
         with:
           name: openvino_package
           path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz
@@ -252,7 +252,7 @@ jobs:
 
       - name: Upload openvino tokenizers package
         if: ${{ always() }}
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
         with:
           name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
           path: ${{ env.BUILD_DIR }}/*.tar.gz
@@ -335,7 +335,7 @@ jobs:
 
       - name: Upload openvino tokenizers wheel
         if: ${{ always() }}
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
         with:
           name: openvino_tokenizers_wheel
           path: ${{ env.BUILD_DIR }}/*.whl

diff --git a/.github/workflows/sdl.yml b/.github/workflows/sdl.yml
@@ -8,10 +8,7 @@ on:
       - master
       - 'releases/**'
 
-permissions:
-  actions: read
-  contents: read
-  security-events: write
+permissions: read-all
 
 concurrency:
   # github.ref is not unique in post-commit
@@ -29,6 +26,10 @@ jobs:
       run:
         shell: bash
     runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
 
     steps:
       - name: Clone Openvino tokenizers sources and tests

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
@@ -33,7 +33,9 @@ jobs:
     runs-on: aks-linux-2-cores-8gb
     container:
       image: 'openvinogithubactions.azurecr.io/openvino_provider:0.1.0'
-      volumes: [ "/mount:/mount" ]
+      volumes:
+        - /mount:/mount
+        - ${{ github.workspace }}:${{ github.workspace }}
     continue-on-error: true
 
     steps:
@@ -145,7 +147,7 @@ jobs:
 
       - name: Upload openvino tokenizers package
         if: ${{ always() }}
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
         with:
           name: openvino_tokenizers_cpack_${{ matrix.build_fast_tokenizers }}_${{ matrix.build_type }}
           path: ${{ env.BUILD_DIR }}/*.zip
@@ -218,7 +220,7 @@ jobs:
 
       - name: Upload openvino tokenizers wheel
         if: ${{ always() }}
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
         with:
           name: openvino_tokenizers_wheel
           path: ${{ env.BUILD_DIR }}/*.whl

diff --git a/pyproject.toml b/pyproject.toml
@@ -3,24 +3,37 @@ name = "openvino-tokenizers"
 version = "2025.0.0.0"
 description = "Convert tokenizers into OpenVINO models"
 requires-python = ">=3.9"
-readme = {file = "README.md", content-type="text/markdown"}
-license = {text = "OSI Approved :: Apache Software License"}
+readme = { file = "README.md", content-type="text/markdown" }
+license = { "file" = "LICENSE" }
 
 authors = [
     { name = "OpenVINO Developers", email = "openvino@intel.com" },
 ]
 
 classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "License :: OSI Approved :: Apache Software License",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Operating System :: Unix",
+    "Operating System :: POSIX :: Linux",
+    "Operating System :: Microsoft :: Windows",
+    "Operating System :: MacOS",
+    "Programming Language :: C++",
+    "Programming Language :: C",
+    "Programming Language :: Python :: 3 :: Only",
 ]
 
 dependencies = [
     # support of nightly openvino packages with dev suffix
-    "openvino~=2025.0.0.0.dev"
+    "openvino~=2025.0.0.dev"
 ]
 
 [project.optional-dependencies]
@@ -100,12 +113,13 @@ OPENVINO_TOKENIZERS_INSTALL_BINDIR = "openvino_tokenizers/lib"
 OPENVINO_TOKENIZERS_INSTALL_LIBDIR = "openvino_tokenizers/lib"
 
 [tool.py-build-cmake.wheel]
+python_tag = ['py3']
 python_abi = "none"
 
 [build-system]
 requires = [
-    "py-build-cmake==0.3.0",
+    "py-build-cmake==0.3.3",
     "cmake~=3.14",
-    "openvino~=2025.0.0.0.dev"
+    "openvino~=2025.0.0.dev"
 ]
 build-backend = "py_build_cmake.build"
diff --git a/python/openvino_tokenizers/__init__.py b/python/openvino_tokenizers/__init__.py
@@ -7,7 +7,7 @@
 import sys
 from itertools import chain
 from pathlib import Path
-from typing import Callable
+from typing import Callable, Optional
 
 import openvino
 from openvino.runtime.utils.node_factory import NodeFactory
@@ -71,14 +71,14 @@ def new_fe_init(self, *args, **kwargs):
 
 
 def _get_factory_callable() -> Callable[[], NodeFactory]:
-    factory = None
+    factory = {}
 
-    def inner() -> NodeFactory:
+    def inner(opset_version: Optional[str] = None) -> NodeFactory:
         nonlocal factory
-        if factory is None:
-            factory = NodeFactory()
+        if factory.get(opset_version, False) == False:
+            factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)
 
-        return factory
+        return factory[opset_version]
 
     return inner
 

diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
@@ -16,7 +16,6 @@
 from openvino import Model, PartialShape, Type
 from openvino.runtime import Node, op
 from openvino.runtime.exceptions import OVTypeError
-from openvino.runtime.opset1.ops import _get_node_factory_opset1
 from openvino.runtime.utils.types import as_node, make_constant_node
 from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
 from transformers.convert_slow_tokenizer import import_protobuf
@@ -391,7 +390,7 @@ def decoding(self) -> None:
             self.pipeline.add_steps(CharsToBytesStep())
         else:
             self.pipeline.add_steps(FuseStep())
-        
+
         if self.utf8_replace_mode is not None and (self.utf8_replace_mode != UTF8ReplaceMode.DISABLE):
             self.pipeline.add_steps(UTF8ValidateStep(mode=self.utf8_replace_mode))
 
@@ -446,16 +445,17 @@ def convert_fast_tokenizer(
     filtered_outputs = []
     for i, output_name in enumerate(ov_tokenizer_output_names):
         current_output = next(
-            (output for output in ov_tokenizer.outputs if output.any_name == output_name),
+            (output for output in ov_tokenizer.outputs if output_name in output.names),
             False,
         )
         if current_output:
             filtered_outputs.append(current_output)
+            filtered_outputs[-1].add_names({output_name})
             continue
 
         if output_name in output_names:
-            ov_tokenizer.output(i).tensor.add_names({output_name})
             filtered_outputs.append(ov_tokenizer.output(i))
+            filtered_outputs[-1].add_names({output_name})
 
     tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters(), TOKENIZER_NAME)
 
@@ -836,10 +836,10 @@ def convert_sentencepiece_model_tokenizer(
     )
 
     if do_left_padding:
-        attention_mask = _get_node_factory_opset1().create(
+        attention_mask = _get_factory("opset1").create(
             "Reverse", [attention_mask, make_constant_node(np.array([-1]))], {"mode": "index"}
         )
-        scattered_input_ids = _get_node_factory_opset1().create(
+        scattered_input_ids = _get_factory("opset1").create(
             "Reverse", [scattered_input_ids, make_constant_node(np.array([-1]))], {"mode": "index"}
         )
 
@@ -863,8 +863,8 @@ def convert_sentencepiece_model_tokenizer(
     outputs = scattered_input_ids.outputs()
 
     if add_attention_mask:
-        attention_mask.output(0).tensor.add_names({ATTENTION_MASK_INPUT_NAME})
         outputs.append(attention_mask.output(0))
+        outputs[-1].add_names({ATTENTION_MASK_INPUT_NAME})
 
     tokenizer = Model(outputs, [input_node], TOKENIZER_NAME)
     tokenizer.validate_nodes_and_infer_types()
@@ -982,7 +982,7 @@ def get_sp_detokenizer(
 
     if params.clean_up_tokenization_spaces:
         detokenizer = RegexDecodingStep.clean_up_tokenization_spaces().get_ov_subgraph(detokenizer)
-    
+
     last_sinks = detokenizer
     if params.utf8_replace_mode is not None and params.utf8_replace_mode != UTF8ReplaceMode.DISABLE:
         last_sinks = UTF8ValidateStep(params.utf8_replace_mode).get_ov_subgraph(detokenizer)

diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -993,8 +993,8 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
                     0
                 )  # TODO: Change RaggedToDense to generate mask of any type
 
-        mask.tensor.add_names({ATTENTION_MASK_INPUT_NAME})
         outputs.append(mask)
+        outputs[-1].add_names({ATTENTION_MASK_INPUT_NAME})
 
         return outputs
 
@@ -1026,7 +1026,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         else:
             vocab_outputs = self.create_string_constant_node(self.vocab).outputs()
         input_nodes.extend(vocab_outputs)
-        
+
         # Put constant with skip tokens even if do_skip_tokens=False, so that it can be switched on/off at runtime.
         # Slice through all skip tokens if flag is true, else slice to get an empty tensor.
         stop_const = op.Constant(Type.i32, Shape([1]), [np.iinfo(np.int32).max if self.do_skip_tokens else 0])

diff --git a/src/regex_normalization.cpp b/src/regex_normalization.cpp
@@ -31,7 +31,8 @@ std::string reformat_replace_pattern(std::string replace_pattern) {
 
 const std::map<std::string, std::string> search_pattern_rewrites = {
     {R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))", R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))"},
-    {R"((^)(.))", R"((^)([\s\S]))"}
+    {R"((^)(.))", R"((^)([\s\S]))"},
+    {R"((^)(.+))", R"((^)([\s\S]))"}
 };
 
 /**
@@ -45,7 +46,9 @@ std::string fix_search_pattern(const std::string search_pattern) {
     if (it == search_pattern_rewrites.end()) {
         return search_pattern;
     }
-    std::cerr << "Replace search pattern: `" << search_pattern << "` -> `" << it->second << "`" << std::endl;
+    if (getenv_bool("OPENVINO_TOKENIZERS_PRINT_DEBUG_INFO", false)) {
+        std::cerr << "Replace search pattern: `" << search_pattern << "` -> `" << it->second << "`" << std::endl;
+    }
     return it->second;
 }