AI21Labs · miri-bar · Jun 18, 2024 · Jun 16, 2024 · Jun 16, 2024 · Jun 16, 2024
@@ -35,6 +35,46 @@ poetry add ai21-tokenizer
 
 ### Tokenizer Creation
 
+### Jamba Tokenizer
+
+```python
+from ai21_tokenizer import Tokenizer, PreTrainedTokenizers
+
+tokenizer = Tokenizer.get_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
+# Your code here
+```
+
+Another way would be to use our Jamba tokenizer directly:
+
+```python
+from ai21_tokenizer import JambaInstructTokenizer
+
+model_path = "<Path to your vocabs file>"
+tokenizer = JambaInstructTokenizer(model_path=model_path)
+# Your code here
+```
+
+#### Async usage
+
+```python
+from ai21_tokenizer import Tokenizer, PreTrainedTokenizers
+
+tokenizer = Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
+# Your code here
+```
+
+Another way is to call the `create` class method of our async Jamba tokenizer:
+
+```python
+from ai21_tokenizer import AsyncJambaInstructTokenizer
+
+model_path = "<Path to your vocabs file>"
+tokenizer = AsyncJambaInstructTokenizer.create(model_path=model_path)
+# Your code here
+```
+
+### J2 Tokenizer
+
 ```python
 from ai21_tokenizer import Tokenizer
 
@@ -52,6 +92,26 @@ config = {} # "dictionary object of your config.json file"
 tokenizer = JurassicTokenizer(model_path=model_path, config=config)
 ```
 
+#### Async usage
+
+```python
+from ai21_tokenizer import Tokenizer
+
+tokenizer = Tokenizer.get_async_tokenizer()
+# Your code here
+```
+
+Another way is to call the `create` class method of our async Jurassic tokenizer:
+
+```python
+from ai21_tokenizer import AsyncJurassicTokenizer
+
+model_path = "<Path to your vocabs file. This is usually a binary file that ends with .model>"
+config = {} # "dictionary object of your config.json file"
+tokenizer = AsyncJurassicTokenizer.create(model_path=model_path, config=config)
+# Your code here
+```
+
 ### Functions
 
 #### Encode and Decode
@@ -67,6 +127,18 @@ decoded_text = tokenizer.decode(encoded_text)
 print(f"Decoded text: {decoded_text}")
 ```
 
+#### Async
+
+```python
+# Assuming you have created an async tokenizer
+text_to_encode = "apple orange banana"
+encoded_text = await tokenizer.encode(text_to_encode)
+print(f"Encoded text: {encoded_text}")
+
+decoded_text = await tokenizer.decode(encoded_text)
+print(f"Decoded text: {decoded_text}")
+```
+
 #### What if you had wanted to convert your tokens to ids or vice versa?
 
 ```python
@@ -76,4 +148,14 @@ print(f"IDs corresponds to Tokens: {tokens}")
 ids = tokenizer.convert_tokens_to_ids(tokens)
 ```
 
+#### Async
+
+```python
+# Assuming you have created an async tokenizer
+tokens = await tokenizer.convert_ids_to_tokens(encoded_text)
+print(f"IDs corresponds to Tokens: {tokens}")
+
+ids = await tokenizer.convert_tokens_to_ids(tokens)
+```
+
 **For more examples, please see our [examples](examples) folder.**
@@ -1,6 +1,6 @@
-from ai21_tokenizer.base_tokenizer import BaseTokenizer
-from ai21_tokenizer.jamba_instruct_tokenizer import JambaInstructTokenizer
-from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer
+from ai21_tokenizer.base_tokenizer import BaseTokenizer, AsyncBaseTokenizer
+from ai21_tokenizer.jamba_instruct_tokenizer import JambaInstructTokenizer, AsyncJambaInstructTokenizer
+from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer, AsyncJurassicTokenizer
 from ai21_tokenizer.tokenizer_factory import TokenizerFactory as Tokenizer, PreTrainedTokenizers
 from .version import VERSION
 
@@ -9,8 +9,11 @@
 __all__ = [
     "Tokenizer",
     "JurassicTokenizer",
+    "AsyncJurassicTokenizer",
     "BaseTokenizer",
+    "AsyncBaseTokenizer",
     "__version__",
     "PreTrainedTokenizers",
     "JambaInstructTokenizer",
+    "AsyncJambaInstructTokenizer",
 ]
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+import os
+import tempfile
+from pathlib import Path
+from typing import List, Union, Optional
+from abc import ABC, abstractmethod
+
+from tokenizers import Tokenizer
+
+from ai21_tokenizer.file_utils import PathLike
+
+_TOKENIZER_FILE = "tokenizer.json"
+_DEFAULT_MODEL_CACHE_DIR = Path(tempfile.gettempdir()) / "jamba_instruct"
+
+
+class BaseJambaInstructTokenizer(ABC):
+    _tokenizer: Optional[Tokenizer] = None
+
+    @abstractmethod
+    def _load_from_cache(self, cache_file: Path) -> Tokenizer:
+        pass
+
+    def _is_cached(self, cache_dir: PathLike) -> bool:
+        return Path(cache_dir).exists() and _TOKENIZER_FILE in os.listdir(cache_dir)
+
+    def _cache_tokenizer(self, tokenizer: Tokenizer, cache_dir: PathLike) -> None:
+        # create cache directory for caching the tokenizer and save it
+        Path(cache_dir).mkdir(parents=True, exist_ok=True)
+        tokenizer.save(str(cache_dir / _TOKENIZER_FILE))
+
+    def _encode(self, text: str, **kwargs) -> List[int]:
+        return self._tokenizer.encode(text, **kwargs).ids
+
+    def _decode(self, token_ids: List[int], **kwargs) -> str:
+        return self._tokenizer.decode(token_ids, **kwargs)
+
+    def _convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
+        if isinstance(tokens, str):
+            return self._tokenizer.token_to_id(tokens)
+
+        return [self._tokenizer.token_to_id(token) for token in tokens]
+
+    def _convert_ids_to_tokens(self, token_ids: Union[int, List[int]]) -> Union[str, List[str]]:
+        if isinstance(token_ids, int):
+            return self._tokenizer.id_to_token(token_ids)
+
+        return [self._tokenizer.id_to_token(token_id) for token_id in token_ids]