From 2e97b09f9d6fde3c1fcae07f4249b3c2fa02ddcd Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Mon, 13 May 2024 15:39:08 -0700 Subject: [PATCH 1/4] Allow trust_remote_code to be specified in HuggingFaceClient kwargs --- src/helm/clients/huggingface_client.py | 41 ++++++++++++++++++++------ 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/src/helm/clients/huggingface_client.py b/src/helm/clients/huggingface_client.py index e25dfb62be..e6f8936074 100644 --- a/src/helm/clients/huggingface_client.py +++ b/src/helm/clients/huggingface_client.py @@ -82,17 +82,40 @@ def __init__(self, pretrained_model_name_or_path: str, openvino=False, **kwargs) export = True self.device = "cpu" - self.model = OVModelForCausalLM.from_pretrained( - pretrained_model_name_or_path, export=export, trust_remote_code=True, **kwargs - ).to(self.device) + # Security issue: currently we trust remote code by default. + # We retain this temporarily to maintain reverse compatibility. + # TODO: Delete if-else and don't set trust_remote_code=True + if "trust_remote_code" in kwargs: + self.model = OVModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, export=export, **kwargs + ).to(self.device) + else: + self.model = OVModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, export=export, trust_remote_code=True, **kwargs + ).to(self.device) else: - self.model = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=True, **kwargs - ).to(self.device) + # Security issue: currently we trust remote code by default. + # We retain this temporarily to maintain reverse compatibility. + # TODO: Delete if-else and don't set trust_remote_code=True + if "trust_remote_code" in kwargs: + self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs).to( + self.device + ) + else: + self.model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=True, **kwargs + ).to(self.device) + self.wrapped_tokenizer: WrappedPreTrainedTokenizer with htrack_block(f"Loading Hugging Face tokenizer for model {pretrained_model_name_or_path}"): - self.wrapped_tokenizer: WrappedPreTrainedTokenizer = HuggingFaceTokenizer.create_tokenizer( - pretrained_model_name_or_path, **kwargs - ) + # Security issue: currently we trust remote code by default. + # We retain this temporarily to maintain reverse compatibility. 
+ # TODO: Delete if-else and don't set trust_remote_code=True + if "trust_remote_code" in kwargs: + self.wrapped_tokenizer = HuggingFaceTokenizer.create_tokenizer(pretrained_model_name_or_path, **kwargs) + else: + self.wrapped_tokenizer = HuggingFaceTokenizer.create_tokenizer( + pretrained_model_name_or_path, trust_remote_code=True, **kwargs + ) def serve_request(self, raw_request: HuggingFaceRequest) -> Dict: with self.wrapped_tokenizer as tokenizer: From 1095751990da1b290b04ab843145ed2bb2818c54 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Mon, 13 May 2024 17:46:04 -0700 Subject: [PATCH 2/4] More changes --- src/helm/clients/huggingface_client.py | 46 +++++++++++++------ src/helm/clients/test_client.py | 6 +-- src/helm/clients/test_huggingface_client.py | 22 +++++++-- src/helm/config/model_deployments.yaml | 1 + src/helm/tokenizers/auto_tokenizer.py | 2 +- src/helm/tokenizers/huggingface_tokenizer.py | 39 ++++++++++------ .../tokenizers/test_huggingface_tokenizer.py | 6 ++- 7 files changed, 86 insertions(+), 36 deletions(-) diff --git a/src/helm/clients/huggingface_client.py b/src/helm/clients/huggingface_client.py index e6f8936074..a5dabb4068 100644 --- a/src/helm/clients/huggingface_client.py +++ b/src/helm/clients/huggingface_client.py @@ -17,6 +17,7 @@ GeneratedOutput, Token, ) +from helm.tokenizers.tokenizer import Tokenizer from .client import CachingClient, truncate_sequence from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, WrappedPreTrainedTokenizer from threading import Lock @@ -53,7 +54,13 @@ class HuggingFaceRequest(TypedDict): class HuggingFaceServer: """A thin wrapper around a Hugging Face AutoModelForCausalLM for HuggingFaceClient to call.""" - def __init__(self, pretrained_model_name_or_path: str, openvino=False, **kwargs): + def __init__( + self, + pretrained_model_name_or_path: str, + wrapped_tokenizer: WrappedPreTrainedTokenizer, + openvino=False, + **kwargs, + ): if torch.cuda.is_available(): hlog("CUDA is available, initializing with a GPU...") self.device: str = "cuda:0" @@ -105,17 +112,7 @@ def __init__(self, pretrained_model_name_or_path: str, openvino=False, **kwargs) self.model = AutoModelForCausalLM.from_pretrained( pretrained_model_name_or_path, trust_remote_code=True, **kwargs ).to(self.device) - self.wrapped_tokenizer: WrappedPreTrainedTokenizer - with htrack_block(f"Loading Hugging Face tokenizer for model {pretrained_model_name_or_path}"): - # Security issue: currently we trust remote code by default. - # We retain this temporarily to maintain reverse compatibility. - # TODO: Delete if-else and don't set trust_remote_code=True - if "trust_remote_code" in kwargs: - self.wrapped_tokenizer = HuggingFaceTokenizer.create_tokenizer(pretrained_model_name_or_path, **kwargs) - else: - self.wrapped_tokenizer = HuggingFaceTokenizer.create_tokenizer( - pretrained_model_name_or_path, trust_remote_code=True, **kwargs - ) + self.wrapped_tokenizer = wrapped_tokenizer def serve_request(self, raw_request: HuggingFaceRequest) -> Dict: with self.wrapped_tokenizer as tokenizer: @@ -218,7 +215,12 @@ class HuggingFaceServerFactory: _servers_lock: Lock = Lock() @staticmethod - def get_server(helm_model_name: str, pretrained_model_name_or_path: str, **kwargs) -> Any: + def get_server( + helm_model_name: str, + pretrained_model_name_or_path: str, + wrapped_tokenizer: WrappedPreTrainedTokenizer, + **kwargs, + ) -> Any: """ Checks if the desired HuggingFaceModel is cached. Creates the HuggingFaceModel if it's not cached. Returns the HuggingFaceModel. 
@@ -230,7 +232,7 @@ def get_server(helm_model_name: str, pretrained_model_name_or_path: str, **kwarg f"for HELM model {helm_model_name} with Hugging Face Transformers" ): HuggingFaceServerFactory._servers[helm_model_name] = HuggingFaceServer( - pretrained_model_name_or_path, **kwargs + pretrained_model_name_or_path, wrapped_tokenizer, **kwargs ) return HuggingFaceServerFactory._servers[helm_model_name] @@ -262,9 +264,22 @@ def _process_huggingface_client_kwargs(raw_kwargs: Dict[str, Any]): class HuggingFaceClient(CachingClient): - def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs): + def __init__( + self, + cache_config: CacheConfig, + tokenizer: Tokenizer, + pretrained_model_name_or_path: Optional[str] = None, + **kwargs, + ): super().__init__(cache_config=cache_config) self._pretrained_model_name_or_path = pretrained_model_name_or_path + if not isinstance(tokenizer, HuggingFaceTokenizer): + raise ValueError( + f"Tokenizer for Hugging Face model {pretrained_model_name_or_path} must be a HuggingFaceTokenizer, " + "but instead it is {tokenizer}" + ) + self._wrapped_tokenizer: WrappedPreTrainedTokenizer = tokenizer.get_pretrained_tokenizer() + self._tokenizer = tokenizer self._kwargs = _process_huggingface_client_kwargs(kwargs) def make_request(self, request: Request) -> RequestResult: @@ -290,6 +305,7 @@ def make_request(self, request: Request) -> RequestResult: huggingface_model: HuggingFaceServer = HuggingFaceServerFactory.get_server( helm_model_name=request.model, pretrained_model_name_or_path=pretrained_model_name_or_path, + wrapped_tokenizer=self._wrapped_tokenizer, **self._kwargs, ) diff --git a/src/helm/clients/test_client.py b/src/helm/clients/test_client.py index 2ca0a7b050..36efc538d3 100644 --- a/src/helm/clients/test_client.py +++ b/src/helm/clients/test_client.py @@ -1,5 +1,5 @@ -from helm.common.cache import BlackHoleCacheConfig -from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer +from helm.common.cache_backend_config import BlackHoleCacheBackendConfig +from helm.tokenizers.auto_tokenizer import AutoTokenizer from .client import truncate_sequence, truncate_and_tokenize_response_text from typing import List from helm.common.request import Request, GeneratedOutput, Token @@ -52,8 +52,8 @@ def test_truncate_sequence(): def test_truncate_and_tokenize_response_text(): - tokenizer = HuggingFaceTokenizer(BlackHoleCacheConfig()) tokenizer_name = "huggingface/gpt2" + tokenizer = AutoTokenizer(credentials={}, cache_backend_config=BlackHoleCacheBackendConfig()) # No truncation response = truncate_and_tokenize_response_text( diff --git a/src/helm/clients/test_huggingface_client.py b/src/helm/clients/test_huggingface_client.py index 374a8d72f5..77d36ce523 100644 --- a/src/helm/clients/test_huggingface_client.py +++ b/src/helm/clients/test_huggingface_client.py @@ -3,12 +3,18 @@ from helm.common.cache import BlackHoleCacheConfig from helm.common.request import Request, RequestResult from helm.clients.huggingface_client import HuggingFaceClient +from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer class TestHuggingFaceClient: def test_gpt2(self): + tokenizer = HuggingFaceTokenizer( + BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2" + ) client = HuggingFaceClient( - cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2" + cache_config=BlackHoleCacheConfig(), + tokenizer=tokenizer, + 
pretrained_model_name_or_path="openai-community/gpt2", ) prompt: str = "I am a computer scientist." result: RequestResult = client.make_request( @@ -29,8 +35,13 @@ def test_gpt2(self): @pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.") def test_gptj_6b(self): + tokenizer = HuggingFaceTokenizer( + BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2" + ) client = HuggingFaceClient( - cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2" + cache_config=BlackHoleCacheConfig(), + tokenizer=tokenizer, + pretrained_model_name_or_path="openai-community/gpt2", ) result: RequestResult = client.make_request( Request( @@ -45,8 +56,13 @@ def test_gptj_6b(self): assert len(result.completions) == 3 def test_logprob(self): + tokenizer = HuggingFaceTokenizer( + BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2" + ) client = HuggingFaceClient( - cache_config=BlackHoleCacheConfig(), pretrained_model_name_or_path="openai-community/gpt2" + cache_config=BlackHoleCacheConfig(), + tokenizer=tokenizer, + pretrained_model_name_or_path="openai-community/gpt2", ) prompt: str = "I am a computer scientist." result: RequestResult = client.make_request( diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index f69b901251..cac18f6da8 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -810,6 +810,7 @@ model_deployments: class_name: "helm.clients.huggingface_client.HuggingFaceClient" args: pretrained_model_name_or_path: openai-community/gpt2 + trust_remote_code: false ## StabilityAI - name: huggingface/stablelm-base-alpha-3b diff --git a/src/helm/tokenizers/auto_tokenizer.py b/src/helm/tokenizers/auto_tokenizer.py index 112c8e7750..3bf95ec706 100644 --- a/src/helm/tokenizers/auto_tokenizer.py +++ b/src/helm/tokenizers/auto_tokenizer.py @@ -41,7 +41,7 @@ def _get_tokenizer(self, tokenizer_name: str) -> Tokenizer: if tokenizer_config: tokenizer_spec = inject_object_spec_args( tokenizer_config.tokenizer_spec, - constant_bindings={"cache_config": cache_config}, + constant_bindings={"cache_config": cache_config, "tokenizer_name": tokenizer_name}, provider_bindings={ "api_key": lambda: provide_api_key(self.credentials, organization), "project_id": lambda: self.credentials.get(organization + "ProjectId", None), # VertexAI diff --git a/src/helm/tokenizers/huggingface_tokenizer.py b/src/helm/tokenizers/huggingface_tokenizer.py index 359bd7722d..fd51a0d992 100644 --- a/src/helm/tokenizers/huggingface_tokenizer.py +++ b/src/helm/tokenizers/huggingface_tokenizer.py @@ -29,8 +29,17 @@ class HuggingFaceTokenizer(CachingTokenizer): _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {} _tokenizers_lock: Lock = Lock() - def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs): + def __init__( + self, + cache_config: CacheConfig, + tokenizer_name: str, + pretrained_model_name_or_path: Optional[str] = None, + **kwargs, + ): super().__init__(cache_config=cache_config) + self._helm_tokenizer_name = ( + tokenizer_name # HELM tokenizer name (e.g. 
"huggingface/gpt2"), *not* Hugging Face Hub Model ID + ) self._pretrained_model_name_or_path = pretrained_model_name_or_path self._kwargs = kwargs @@ -40,7 +49,11 @@ def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPre # To avoid deadlocks when using HuggingFace tokenizers with multiple processes # TODO: Figure out if we actually need this. os.environ["TOKENIZERS_PARALLELISM"] = "False" - + from_pretrained_kwargs = {**kwargs} + # If unspecified, set `use_fast=True` by default. + if "use_fast" not in from_pretrained_kwargs: + from_pretrained_kwargs["use_fast"] = True + print(from_pretrained_kwargs) try: # From the Hugging Face documentation, "local_files_only(defaults to False) — # Whether or not to only look at local files". @@ -53,14 +66,14 @@ def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPre # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available. return WrappedPreTrainedTokenizer( AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs + pretrained_model_name_or_path, local_files_only=True, **from_pretrained_kwargs ) ) except OSError: hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...") return WrappedPreTrainedTokenizer( AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs + pretrained_model_name_or_path, local_files_only=False, **from_pretrained_kwargs ) ) @@ -84,13 +97,13 @@ def get_tokenizer( ) return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name] - def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrainedTokenizer: - """Method used in both _tokenize_do_it and _decode_do_it to get the tokenizer.""" + def get_pretrained_tokenizer(self) -> WrappedPreTrainedTokenizer: + """Get the underlying Hugging Face WrappedPreTrainedTokenizer.""" pretrained_model_name_or_path = ( - self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else request["tokenizer"] + self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else self._helm_tokenizer_name ) return HuggingFaceTokenizer.get_tokenizer( - helm_tokenizer_name=request["tokenizer"], + helm_tokenizer_name=self._helm_tokenizer_name, pretrained_model_name_or_path=pretrained_model_name_or_path, **self._kwargs, ) @@ -98,7 +111,7 @@ def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrain def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: if request["encode"]: if request["truncation"]: - with self._get_tokenizer_for_request(request) as tokenizer: + with self.get_pretrained_tokenizer() as tokenizer: tokens = tokenizer.encode( request["text"], truncation=request["truncation"], @@ -106,7 +119,7 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: add_special_tokens=False, ) else: - with self._get_tokenizer_for_request(request) as tokenizer: + with self.get_pretrained_tokenizer() as tokenizer: tokens = tokenizer.encode(request["text"], add_special_tokens=False) else: if "gpt" in request["tokenizer"] or request["tokenizer"] in [ @@ -118,7 +131,7 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: # convert_tokens_to_string method. We prefer to use this method instead # of the hacky cleanup_tokens method below as it might handle cases # we haven't thought of in cleanup_tokens. 
- with self._get_tokenizer_for_request(request) as tokenizer: + with self.get_pretrained_tokenizer() as tokenizer: tokens = [ tokenizer.convert_tokens_to_string([token]) for token in tokenizer.tokenize(request["text"]) ] @@ -131,7 +144,7 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: # But this replaces all the "▁" characters by "", which is not what we want. # This would be problematic as tokenize(" Hello", encode=False) would return ["Hello"] # Just like tokenize("Hello", encode=False) would return ["Hello"]. - with self._get_tokenizer_for_request(request) as tokenizer: + with self.get_pretrained_tokenizer() as tokenizer: tokens = tokenizer.tokenize(request["text"]) # Some tokenizers (e.g. Qwen/Qwen-7B) return the tokens as bytes, so we have to decode them to strings. if tokens and type(tokens[0]) == bytes: @@ -140,7 +153,7 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: return {"tokens": tokens} def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: - with self._get_tokenizer_for_request(request) as tokenizer: + with self.get_pretrained_tokenizer() as tokenizer: text = tokenizer.decode( request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"] ) diff --git a/src/helm/tokenizers/test_huggingface_tokenizer.py b/src/helm/tokenizers/test_huggingface_tokenizer.py index 437779c3b8..a94cf1e1c3 100644 --- a/src/helm/tokenizers/test_huggingface_tokenizer.py +++ b/src/helm/tokenizers/test_huggingface_tokenizer.py @@ -17,7 +17,11 @@ class TestHuggingFaceGPT2Tokenizer: def setup_method(self, method): cache_file = tempfile.NamedTemporaryFile(delete=False) self.cache_path: str = cache_file.name - self.tokenizer = HuggingFaceTokenizer(SqliteCacheConfig(self.cache_path)) + self.tokenizer = HuggingFaceTokenizer( + SqliteCacheConfig(self.cache_path), + tokenizer_name="huggingface/gpt2", + pretrained_model_name_or_path="openai-community/gpt2", + ) def teardown_method(self, method): os.remove(self.cache_path) From eaccf94c5658b66347330a73714ec21637c72c7c Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Mon, 13 May 2024 18:03:25 -0700 Subject: [PATCH 3/4] More fixes --- src/helm/tokenizers/huggingface_tokenizer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/helm/tokenizers/huggingface_tokenizer.py b/src/helm/tokenizers/huggingface_tokenizer.py index fd51a0d992..d9aec51cd8 100644 --- a/src/helm/tokenizers/huggingface_tokenizer.py +++ b/src/helm/tokenizers/huggingface_tokenizer.py @@ -109,6 +109,11 @@ def get_pretrained_tokenizer(self) -> WrappedPreTrainedTokenizer: ) def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: + if request["tokenizer"] != self._helm_tokenizer_name: + raise ValueError( + f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} " + "but instead the request has tokenizer {request['tokenizer']}" + ) if request["encode"]: if request["truncation"]: with self.get_pretrained_tokenizer() as tokenizer: @@ -153,6 +158,11 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: return {"tokens": tokens} def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: + if request["tokenizer"] != self._helm_tokenizer_name: + raise ValueError( + f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} " + "but instead the request has tokenizer {request['tokenizer']}" + ) with self.get_pretrained_tokenizer() as tokenizer: text = tokenizer.decode( request["tokens"], 
clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"] From 8a5e714cc8e2f0d6868d143a907b05976ce93ff1 Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Mon, 13 May 2024 18:05:47 -0700 Subject: [PATCH 4/4] Some changes --- src/helm/clients/huggingface_client.py | 2 +- src/helm/config/model_deployments.yaml | 1 - src/helm/tokenizers/huggingface_tokenizer.py | 12 ++++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/helm/clients/huggingface_client.py b/src/helm/clients/huggingface_client.py index a5dabb4068..3099b9ca14 100644 --- a/src/helm/clients/huggingface_client.py +++ b/src/helm/clients/huggingface_client.py @@ -278,7 +278,7 @@ def __init__( f"Tokenizer for Hugging Face model {pretrained_model_name_or_path} must be a HuggingFaceTokenizer, " "but instead it is {tokenizer}" ) - self._wrapped_tokenizer: WrappedPreTrainedTokenizer = tokenizer.get_pretrained_tokenizer() + self._wrapped_tokenizer: WrappedPreTrainedTokenizer = tokenizer.get_wrapped_tokenizer() self._tokenizer = tokenizer self._kwargs = _process_huggingface_client_kwargs(kwargs) diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml index cac18f6da8..f69b901251 100644 --- a/src/helm/config/model_deployments.yaml +++ b/src/helm/config/model_deployments.yaml @@ -810,7 +810,6 @@ model_deployments: class_name: "helm.clients.huggingface_client.HuggingFaceClient" args: pretrained_model_name_or_path: openai-community/gpt2 - trust_remote_code: false ## StabilityAI - name: huggingface/stablelm-base-alpha-3b diff --git a/src/helm/tokenizers/huggingface_tokenizer.py b/src/helm/tokenizers/huggingface_tokenizer.py index d9aec51cd8..4379187919 100644 --- a/src/helm/tokenizers/huggingface_tokenizer.py +++ b/src/helm/tokenizers/huggingface_tokenizer.py @@ -97,7 +97,7 @@ def get_tokenizer( ) return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name] - def get_pretrained_tokenizer(self) -> WrappedPreTrainedTokenizer: + def get_wrapped_tokenizer(self) -> WrappedPreTrainedTokenizer: """Get the underlying Hugging Face WrappedPreTrainedTokenizer.""" pretrained_model_name_or_path = ( self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else self._helm_tokenizer_name @@ -116,7 +116,7 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: ) if request["encode"]: if request["truncation"]: - with self.get_pretrained_tokenizer() as tokenizer: + with self.get_wrapped_tokenizer() as tokenizer: tokens = tokenizer.encode( request["text"], truncation=request["truncation"], @@ -124,7 +124,7 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: add_special_tokens=False, ) else: - with self.get_pretrained_tokenizer() as tokenizer: + with self.get_wrapped_tokenizer() as tokenizer: tokens = tokenizer.encode(request["text"], add_special_tokens=False) else: if "gpt" in request["tokenizer"] or request["tokenizer"] in [ @@ -136,7 +136,7 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: # convert_tokens_to_string method. We prefer to use this method instead # of the hacky cleanup_tokens method below as it might handle cases # we haven't thought of in cleanup_tokens. 
- with self.get_pretrained_tokenizer() as tokenizer: + with self.get_wrapped_tokenizer() as tokenizer: tokens = [ tokenizer.convert_tokens_to_string([token]) for token in tokenizer.tokenize(request["text"]) ] @@ -149,7 +149,7 @@ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: # But this replaces all the "▁" characters by "", which is not what we want. # This would be problematic as tokenize(" Hello", encode=False) would return ["Hello"] # Just like tokenize("Hello", encode=False) would return ["Hello"]. - with self.get_pretrained_tokenizer() as tokenizer: + with self.get_wrapped_tokenizer() as tokenizer: tokens = tokenizer.tokenize(request["text"]) # Some tokenizers (e.g. Qwen/Qwen-7B) return the tokens as bytes, so we have to decode them to strings. if tokens and type(tokens[0]) == bytes: @@ -163,7 +163,7 @@ def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]: f"This HuggingFaceTokenizer expects tokenizer to be {self._helm_tokenizer_name} " "but instead the request has tokenizer {request['tokenizer']}" ) - with self.get_pretrained_tokenizer() as tokenizer: + with self.get_wrapped_tokenizer() as tokenizer: text = tokenizer.decode( request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"] )
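
Usage sketch (illustrative only, not part of any patch above): after this series, HuggingFaceClient no longer constructs its own tokenizer and no longer hard-codes trust_remote_code=True. A minimal, untested wiring of the refactored pieces, assuming a HELM checkout where the classes, constructor signatures, and Request fields match the diffs and tests shown above, might look like this:

    from helm.common.cache import BlackHoleCacheConfig
    from helm.common.request import Request
    from helm.clients.huggingface_client import HuggingFaceClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    # The tokenizer now takes both the HELM tokenizer name and the Hugging Face
    # Hub model ID (patch 2).
    tokenizer = HuggingFaceTokenizer(
        BlackHoleCacheConfig(),
        tokenizer_name="huggingface/gpt2",
        pretrained_model_name_or_path="openai-community/gpt2",
    )

    # The client now requires the tokenizer to be injected, and trust_remote_code
    # is forwarded through **kwargs instead of being forced to True; passing it
    # explicitly opts out of the legacy default via the kwargs check from patch 1.
    client = HuggingFaceClient(
        cache_config=BlackHoleCacheConfig(),
        tokenizer=tokenizer,
        pretrained_model_name_or_path="openai-community/gpt2",
        trust_remote_code=False,
    )

    result = client.make_request(
        Request(
            model="huggingface/gpt2",
            model_deployment="huggingface/gpt2",
            prompt="I am a computer scientist.",
            num_completions=1,
            max_tokens=5,
        )
    )
    assert len(result.completions) == 1

Injecting the tokenizer this way avoids loading it a second time inside HuggingFaceServer, and the name check added in patch 3 then guards against a request whose tokenizer differs from the one the client was configured with.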