Skip to content

Commit

Permalink
feat: add jamba 1.5 tokenizers (mini and large) (#96)
Browse files Browse the repository at this point in the history
* feat: add jamba 1.5 tokenizers (mini and large)

* test: fix value error on conftest.py

* refactor: rename file name

* chore: fix path

* fix: enter
  • Loading branch information
miri-bar authored Aug 21, 2024
1 parent 5be5b59 commit e052b22
Show file tree
Hide file tree
Showing 12 changed files with 612 additions and 14 deletions.
62 changes: 59 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,63 @@ poetry add ai21-tokenizer

### Tokenizer Creation

### Jamba Tokenizer
### Jamba 1.5 Mini Tokenizer

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = Tokenizer.get_tokenizer(PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER)
# Your code here
```

Another way would be to use our Jamba 1.5 Mini tokenizer directly:

```python
from ai21_tokenizer import Jamba1_5Tokenizer

model_path = "<Identifier of a model on the Hugging Face Hub that contains a tokenizer.json file>"
tokenizer = Jamba1_5Tokenizer(model_path=model_path)
# Your code here
```

#### Async usage

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER)
# Your code here
```

### Jamba 1.5 Large Tokenizer

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = Tokenizer.get_tokenizer(PreTrainedTokenizers.JAMBA_1_5_LARGE_TOKENIZER)
# Your code here
```

Another way would be to use our Jamba 1.5 Large tokenizer directly:

```python
from ai21_tokenizer import Jamba1_5Tokenizer

model_path = "<Identifier of a model on the Hugging Face Hub that contains a tokenizer.json file>"
tokenizer = Jamba1_5Tokenizer(model_path=model_path)
# Your code here
```

#### Async usage

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_1_5_LARGE_TOKENIZER)
# Your code here
```

### Jamba Instruct Tokenizer

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers
Expand All @@ -59,7 +115,7 @@ tokenizer = JambaInstructTokenizer(model_path=model_path)
```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
# Your code here
```

Expand Down Expand Up @@ -97,7 +153,7 @@ tokenizer = JurassicTokenizer(model_path=model_path, config=config)
```python
from ai21_tokenizer import Tokenizer

tokenizer = Tokenizer.get_async_tokenizer()
tokenizer = await Tokenizer.get_async_tokenizer()
# Your code here
```

Expand Down
3 changes: 3 additions & 0 deletions ai21_tokenizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from ai21_tokenizer.jamba_instruct_tokenizer import JambaInstructTokenizer, AsyncJambaInstructTokenizer
from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer, AsyncJurassicTokenizer
from ai21_tokenizer.tokenizer_factory import TokenizerFactory as Tokenizer, PreTrainedTokenizers
from ai21_tokenizer.jamba_1_5_tokenizer import Jamba1_5Tokenizer, AsyncJamba1_5Tokenizer
from .version import VERSION

__version__ = VERSION
Expand All @@ -16,4 +17,6 @@
"PreTrainedTokenizers",
"JambaInstructTokenizer",
"AsyncJambaInstructTokenizer",
"Jamba1_5Tokenizer",
"AsyncJamba1_5Tokenizer",
]
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import os
import tempfile
from pathlib import Path
from typing import List, Union, Optional
from abc import ABC, abstractmethod
Expand All @@ -11,10 +10,9 @@
from ai21_tokenizer.file_utils import PathLike

_TOKENIZER_FILE = "tokenizer.json"
_DEFAULT_MODEL_CACHE_DIR = Path(tempfile.gettempdir()) / "jamba_instruct"


class BaseJambaInstructTokenizer(ABC):
class BaseJambaTokenizer(ABC):
_tokenizer: Optional[Tokenizer] = None

@abstractmethod
Expand Down
148 changes: 148 additions & 0 deletions ai21_tokenizer/jamba_1_5_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
from __future__ import annotations

import tempfile
from pathlib import Path
from typing import Union, List, Optional, cast

from tokenizers import Tokenizer

from ai21_tokenizer import BaseTokenizer, AsyncBaseTokenizer
from ai21_tokenizer.file_utils import PathLike
from ai21_tokenizer.base_jamba_tokenizer import BaseJambaTokenizer

_TOKENIZER_FILE = "tokenizer.json"
_DEFAULT_MODEL_CACHE_DIR = Path(tempfile.gettempdir()) / "jamba_1_5"


class Jamba1_5Tokenizer(BaseJambaTokenizer, BaseTokenizer):
    """Synchronous tokenizer for the Jamba 1.5 model family.

    On first construction the tokenizer.json file is fetched from the
    Hugging Face Hub and written to a local cache directory; subsequent
    constructions load straight from that cache.
    """

    def __init__(
        self,
        model_path: str,
        cache_dir: Optional[PathLike] = None,
    ):
        """
        Args:
            model_path: str
                The identifier of a Model on the Hugging Face Hub, that contains a tokenizer.json file
            cache_dir: Optional[PathLike]
                The directory to cache the tokenizer.json file.
                If not provided, the default cache directory will be used
        """
        resolved_cache_dir = cache_dir if cache_dir else _DEFAULT_MODEL_CACHE_DIR
        self._tokenizer = self._init_tokenizer(model_path=model_path, cache_dir=resolved_cache_dir)

    def _init_tokenizer(self, model_path: PathLike, cache_dir: PathLike) -> Tokenizer:
        # Fetch from the Hub only when no cached copy exists yet.
        if not self._is_cached(cache_dir):
            fetched = cast(
                Tokenizer,
                Tokenizer.from_pretrained(model_path),
            )
            self._cache_tokenizer(fetched, cache_dir)
            return fetched

        return self._load_from_cache(cache_dir / _TOKENIZER_FILE)

    def _load_from_cache(self, cache_file: Path) -> Tokenizer:
        loaded = Tokenizer.from_file(str(cache_file))
        return cast(Tokenizer, loaded)

    def encode(self, text: str, **kwargs) -> List[int]:
        """Encode *text* into a list of token ids."""
        return self._encode(text, **kwargs)

    def decode(self, token_ids: List[int], **kwargs) -> str:
        """Decode a list of token ids back into text."""
        return self._decode(token_ids, **kwargs)

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        return self._convert_tokens_to_ids(tokens)

    def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs) -> Union[str, List[str]]:
        return self._convert_ids_to_tokens(token_ids)

    @property
    def vocab_size(self) -> int:
        """Size of the underlying tokenizer's vocabulary."""
        return self._tokenizer.get_vocab_size()


class AsyncJamba1_5Tokenizer(BaseJambaTokenizer, AsyncBaseTokenizer):
    """Asynchronous tokenizer for the Jamba 1.5 model family.

    Instances must be created through the async factory ``create`` (or
    ``Tokenizer.get_async_tokenizer``): loading the underlying tokenizer
    requires awaiting, so direct construction raises ``ValueError``.
    """

    _model_path: str
    # Lazily initialized; stays None until _init_tokenizer completes.
    _tokenizer: Optional[Tokenizer] = None
    _cache_dir: Optional[PathLike] = None

    def __init__(self):
        # Intentionally unusable: initialization is async-only (see create()).
        raise ValueError(
            "Do not create AsyncJamba1_5Tokenizer directly. Use either AsyncJamba1_5Tokenizer.create or "
            "Tokenizer.get_async_tokenizer"
        )

    @classmethod
    async def create(
        cls,
        model_path: str,
        cache_dir: Optional[PathLike] = None,
    ):
        """Asynchronously build and initialize a tokenizer instance.

        Args:
            model_path: str
                The identifier of a Model on the Hugging Face Hub, that contains a tokenizer.json file
            cache_dir: Optional[PathLike]
                The directory to cache the tokenizer.json file.
                If not provided, the default cache directory will be used
        """
        # Bypass __init__ (which raises) and set up instance state manually.
        self = cls.__new__(cls)
        self._model_path = model_path
        self._cache_dir = cache_dir or _DEFAULT_MODEL_CACHE_DIR
        await self._init_tokenizer()
        return self

    async def encode(self, text: str, **kwargs) -> List[int]:
        """Encode *text* into a list of token ids."""
        if not self._tokenizer:
            # Lazily (re)initialize if the tokenizer has not been loaded yet.
            await self._init_tokenizer()

        return await self._make_async_call(callback_func=self._encode, text=text, **kwargs)

    async def decode(self, token_ids: List[int], **kwargs) -> str:
        """Decode a list of token ids back into text."""
        if not self._tokenizer:
            await self._init_tokenizer()

        return await self._make_async_call(callback_func=self._decode, token_ids=token_ids, **kwargs)

    async def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        if not self._tokenizer:
            await self._init_tokenizer()

        return await self._make_async_call(callback_func=self._convert_tokens_to_ids, tokens=tokens)

    async def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs) -> Union[str, List[str]]:
        if not self._tokenizer:
            await self._init_tokenizer()

        return await self._make_async_call(callback_func=self._convert_ids_to_tokens, token_ids=token_ids, **kwargs)

    @property
    def vocab_size(self) -> int:
        """Size of the underlying tokenizer's vocabulary.

        Raises:
            ValueError: if the tokenizer was never initialized (a property
                cannot await, so it cannot lazily initialize like the
                async methods above do).
        """
        if not self._tokenizer:
            raise ValueError(
                "Tokenizer not properly initialized. Please do not initialize the tokenizer directly. Use "
                "Tokenizer.get_async_tokenizer instead."
            )
        return self._tokenizer.get_vocab_size()

    async def _init_tokenizer(self):
        # Prefer the on-disk copy; otherwise fetch from the Hub and cache it.
        if self._is_cached(self._cache_dir):
            self._tokenizer = await self._load_from_cache(self._cache_dir / _TOKENIZER_FILE)
        else:
            tokenizer_from_pretrained = await self._make_async_call(
                callback_func=Tokenizer.from_pretrained,
                identifier=self._model_path,
            )

            tokenizer = cast(
                Tokenizer,
                tokenizer_from_pretrained,
            )
            self._cache_tokenizer(tokenizer, self._cache_dir)

            self._tokenizer = tokenizer

    async def _load_from_cache(self, cache_file: Path) -> Tokenizer:
        # Tokenizer.from_file is blocking, so it is dispatched off the event loop.
        tokenizer_from_file = await self._make_async_call(callback_func=Tokenizer.from_file, path=str(cache_file))
        return cast(Tokenizer, tokenizer_from_file)
6 changes: 3 additions & 3 deletions ai21_tokenizer/jamba_instruct_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@

from ai21_tokenizer import BaseTokenizer, AsyncBaseTokenizer
from ai21_tokenizer.file_utils import PathLike
from ai21_tokenizer.base_jamba_instruct_tokenizer import BaseJambaInstructTokenizer
from ai21_tokenizer.base_jamba_tokenizer import BaseJambaTokenizer

_TOKENIZER_FILE = "tokenizer.json"
_DEFAULT_MODEL_CACHE_DIR = Path(tempfile.gettempdir()) / "jamba_instruct"


class JambaInstructTokenizer(BaseJambaInstructTokenizer, BaseTokenizer):
class JambaInstructTokenizer(BaseJambaTokenizer, BaseTokenizer):
def __init__(
self,
model_path: str,
Expand Down Expand Up @@ -62,7 +62,7 @@ def vocab_size(self) -> int:
return self._tokenizer.get_vocab_size()


class AsyncJambaInstructTokenizer(BaseJambaInstructTokenizer, AsyncBaseTokenizer):
class AsyncJambaInstructTokenizer(BaseJambaTokenizer, AsyncBaseTokenizer):
_model_path: str
_tokenizer: Tokenizer = None
_cache_dir: PathLike = None
Expand Down
36 changes: 36 additions & 0 deletions ai21_tokenizer/tokenizer_factory.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,37 @@
import os
import tempfile
from pathlib import Path

from ai21_tokenizer.base_tokenizer import BaseTokenizer, AsyncBaseTokenizer
from ai21_tokenizer.jamba_instruct_tokenizer import JambaInstructTokenizer, AsyncJambaInstructTokenizer
from ai21_tokenizer.jamba_1_5_tokenizer import Jamba1_5Tokenizer, AsyncJamba1_5Tokenizer
from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer, AsyncJurassicTokenizer

_LOCAL_RESOURCES_PATH = Path(__file__).parent / "resources"
_ENV_CACHE_DIR_KEY = "AI21_TOKENIZER_CACHE_DIR"
JAMBA_TOKENIZER_HF_PATH = "ai21labs/Jamba-v0.1"
JAMBA_1_5_MINI_TOKENIZER_HF_PATH = "ai21labs/AI21-Jamba-1.5-Mini"
JAMBA_1_5_LARGE_TOKENIZER_HF_PATH = "ai21labs/AI21-Jamba-1.5-Large"


def _get_cache_dir(tokenizer_name: str) -> Path:
    """Resolve the directory used to cache a pretrained tokenizer's files.

    The ``AI21_TOKENIZER_CACHE_DIR`` environment variable, when set,
    overrides the default of ``<system temp dir>/<sanitized name>``.

    Args:
        tokenizer_name: Pretrained tokenizer name, e.g. "jamba-1.5-mini-tokenizer".

    Returns:
        The cache directory path (this function does not create it).
    """
    env_cache_dir = os.getenv(_ENV_CACHE_DIR_KEY)
    if env_cache_dir is not None:
        return Path(env_cache_dir)

    # "." and "-" are awkward in directory names, so normalize both to "_".
    tokenizer_name_as_path = tokenizer_name.replace(".", "_").replace("-", "_")
    return Path(tempfile.gettempdir()) / tokenizer_name_as_path


class PreTrainedTokenizers:
    """Names accepted by TokenizerFactory.get_tokenizer / get_async_tokenizer."""

    J2_TOKENIZER = "j2-tokenizer"
    JAMBA_INSTRUCT_TOKENIZER = "jamba-instruct-tokenizer"
    # Resolves to the same tokenizer as JAMBA_INSTRUCT_TOKENIZER in the factory.
    JAMBA_TOKENIZER = "jamba-tokenizer"
    JAMBA_1_5_MINI_TOKENIZER = "jamba-1.5-mini-tokenizer"
    JAMBA_1_5_LARGE_TOKENIZER = "jamba-1.5-large-tokenizer"


class TokenizerFactory:
Expand All @@ -27,6 +45,14 @@ def get_tokenizer(
cls,
tokenizer_name: str = PreTrainedTokenizers.J2_TOKENIZER,
) -> BaseTokenizer:
cache_dir = _get_cache_dir(tokenizer_name=tokenizer_name)

if tokenizer_name == PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER:
return Jamba1_5Tokenizer(model_path=JAMBA_1_5_MINI_TOKENIZER_HF_PATH, cache_dir=cache_dir)

if tokenizer_name == PreTrainedTokenizers.JAMBA_1_5_LARGE_TOKENIZER:
return Jamba1_5Tokenizer(model_path=JAMBA_1_5_LARGE_TOKENIZER_HF_PATH, cache_dir=cache_dir)

if (
tokenizer_name == PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER
or tokenizer_name == PreTrainedTokenizers.JAMBA_TOKENIZER
Expand All @@ -43,6 +69,16 @@ async def get_async_tokenizer(
cls,
tokenizer_name: str = PreTrainedTokenizers.J2_TOKENIZER,
) -> AsyncBaseTokenizer:
cache_dir = _get_cache_dir(tokenizer_name=tokenizer_name)

if tokenizer_name == PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER:
return await AsyncJamba1_5Tokenizer.create(model_path=JAMBA_1_5_MINI_TOKENIZER_HF_PATH, cache_dir=cache_dir)

if tokenizer_name == PreTrainedTokenizers.JAMBA_1_5_LARGE_TOKENIZER:
return await AsyncJamba1_5Tokenizer.create(
model_path=JAMBA_1_5_LARGE_TOKENIZER_HF_PATH, cache_dir=cache_dir
)

if (
tokenizer_name == PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER
or tokenizer_name == PreTrainedTokenizers.JAMBA_TOKENIZER
Expand Down
18 changes: 18 additions & 0 deletions examples/async_jamba_1_5_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Example: encode and decode text with the async Jamba 1.5 Mini tokenizer."""
import asyncio

from ai21_tokenizer import Tokenizer, PreTrainedTokenizers


async def main():
    """Round-trip a sample sentence through the async tokenizer."""
    tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER)

    example_sentence = "This sentence should be encoded and then decoded. Hurray!!!!"
    encoded = await tokenizer.encode(example_sentence)
    decoded = await tokenizer.decode(encoded)

    assert decoded == example_sentence
    print("Example sentence: " + example_sentence)
    print("Encoded and decoded: " + decoded)


# Guard the entry point so importing this module does not trigger network I/O.
if __name__ == "__main__":
    asyncio.run(main())
11 changes: 11 additions & 0 deletions examples/jamba_1_5_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Example: encode and decode text with the Jamba 1.5 Mini tokenizer.
from ai21_tokenizer import Jamba1_5Tokenizer

# Hugging Face Hub identifier of the model whose tokenizer.json is used.
model_path = "ai21labs/AI21-Jamba-1.5-Mini"

tokenizer = Jamba1_5Tokenizer(model_path=model_path)

example_sentence = "This sentence should be encoded and then decoded. Hurray!!!!"
encoded = tokenizer.encode(example_sentence)
decoded = tokenizer.decode(encoded)

# The round-trip should reproduce the original sentence exactly.
assert decoded == example_sentence
Loading

0 comments on commit e052b22

Please sign in to comment.