Skip to content

Commit

Permalink
feat: add jamba 1.5 tokenizers (mini and large) (#96)
Browse files Browse the repository at this point in the history
* feat: add jamba 1.5 tokenizers (mini and large)

* test: fix value error on conftest.py

* refactor: rename file name

* chore: fix path

* fix: enter
  • Loading branch information
miri-bar authored Aug 21, 2024
1 parent 5be5b59 commit e052b22
Show file tree
Hide file tree
Showing 12 changed files with 612 additions and 14 deletions.
62 changes: 59 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,63 @@ poetry add ai21-tokenizer

### Tokenizer Creation

### Jamba Tokenizer
### Jamba 1.5 Mini Tokenizer

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = Tokenizer.get_tokenizer(PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER)
# Your code here
```

Another way would be to use our Jamba 1.5 Mini tokenizer directly:

```python
from ai21_tokenizer import Jamba1_5Tokenizer

model_path = "<Identifier of a model on the Hugging Face Hub that contains a tokenizer.json file>"
tokenizer = Jamba1_5Tokenizer(model_path=model_path)
# Your code here
```

#### Async usage

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER)
# Your code here
```

### Jamba 1.5 Large Tokenizer

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = Tokenizer.get_tokenizer(PreTrainedTokenizers.JAMBA_1_5_LARGE_TOKENIZER)
# Your code here
```

Another way would be to use our Jamba 1.5 Large tokenizer directly:

```python
from ai21_tokenizer import Jamba1_5Tokenizer

model_path = "<Identifier of a model on the Hugging Face Hub that contains a tokenizer.json file>"
tokenizer = Jamba1_5Tokenizer(model_path=model_path)
# Your code here
```

#### Async usage

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_1_5_LARGE_TOKENIZER)
# Your code here
```

### Jamba Instruct Tokenizer

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers
Expand All @@ -59,7 +115,7 @@ tokenizer = JambaInstructTokenizer(model_path=model_path)
```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
# Your code here
```

Expand Down Expand Up @@ -97,7 +153,7 @@ tokenizer = JurassicTokenizer(model_path=model_path, config=config)
```python
from ai21_tokenizer import Tokenizer

tokenizer = Tokenizer.get_async_tokenizer()
tokenizer = await Tokenizer.get_async_tokenizer()
# Your code here
```

Expand Down
3 changes: 3 additions & 0 deletions ai21_tokenizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from ai21_tokenizer.jamba_instruct_tokenizer import JambaInstructTokenizer, AsyncJambaInstructTokenizer
from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer, AsyncJurassicTokenizer
from ai21_tokenizer.tokenizer_factory import TokenizerFactory as Tokenizer, PreTrainedTokenizers
from ai21_tokenizer.jamba_1_5_tokenizer import Jamba1_5Tokenizer, AsyncJamba1_5Tokenizer
from .version import VERSION

__version__ = VERSION
Expand All @@ -16,4 +17,6 @@
"PreTrainedTokenizers",
"JambaInstructTokenizer",
"AsyncJambaInstructTokenizer",
"Jamba1_5Tokenizer",
"AsyncJamba1_5Tokenizer",
]
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import os
import tempfile
from pathlib import Path
from typing import List, Union, Optional
from abc import ABC, abstractmethod
Expand All @@ -11,10 +10,9 @@
from ai21_tokenizer.file_utils import PathLike

_TOKENIZER_FILE = "tokenizer.json"
_DEFAULT_MODEL_CACHE_DIR = Path(tempfile.gettempdir()) / "jamba_instruct"


class BaseJambaInstructTokenizer(ABC):
class BaseJambaTokenizer(ABC):
_tokenizer: Optional[Tokenizer] = None

@abstractmethod
Expand Down
148 changes: 148 additions & 0 deletions ai21_tokenizer/jamba_1_5_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
from __future__ import annotations

import tempfile
from pathlib import Path
from typing import Union, List, Optional, cast

from tokenizers import Tokenizer

from ai21_tokenizer import BaseTokenizer, AsyncBaseTokenizer
from ai21_tokenizer.file_utils import PathLike
from ai21_tokenizer.base_jamba_tokenizer import BaseJambaTokenizer

_TOKENIZER_FILE = "tokenizer.json"
_DEFAULT_MODEL_CACHE_DIR = Path(tempfile.gettempdir()) / "jamba_1_5"


class Jamba1_5Tokenizer(BaseJambaTokenizer, BaseTokenizer):
    """Synchronous tokenizer for the Jamba 1.5 model family.

    On first construction the tokenizer.json file is fetched from the
    Hugging Face Hub and written to a local cache directory; subsequent
    constructions load straight from that cache.
    """

    def __init__(
        self,
        model_path: str,
        cache_dir: Optional[PathLike] = None,
    ):
        """
        Args:
            model_path: str
                The identifier of a Model on the Hugging Face Hub, that contains a tokenizer.json file
            cache_dir: Optional[PathLike]
                The directory to cache the tokenizer.json file.
                If not provided, the default cache directory will be used
        """
        resolved_cache_dir = cache_dir if cache_dir else _DEFAULT_MODEL_CACHE_DIR
        self._tokenizer = self._init_tokenizer(model_path=model_path, cache_dir=resolved_cache_dir)

    def _init_tokenizer(self, model_path: PathLike, cache_dir: PathLike) -> Tokenizer:
        # Fetch from the Hub only when no cached copy exists yet.
        if not self._is_cached(cache_dir):
            fetched = cast(
                Tokenizer,
                Tokenizer.from_pretrained(model_path),
            )
            self._cache_tokenizer(fetched, cache_dir)
            return fetched

        return self._load_from_cache(cache_dir / _TOKENIZER_FILE)

    def _load_from_cache(self, cache_file: Path) -> Tokenizer:
        loaded = Tokenizer.from_file(str(cache_file))
        return cast(Tokenizer, loaded)

    def encode(self, text: str, **kwargs) -> List[int]:
        """Encode *text* into a list of token ids."""
        return self._encode(text, **kwargs)

    def decode(self, token_ids: List[int], **kwargs) -> str:
        """Decode a list of token ids back into text."""
        return self._decode(token_ids, **kwargs)

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        return self._convert_tokens_to_ids(tokens)

    def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs) -> Union[str, List[str]]:
        return self._convert_ids_to_tokens(token_ids)

    @property
    def vocab_size(self) -> int:
        """Size of the underlying tokenizer's vocabulary."""
        return self._tokenizer.get_vocab_size()


class AsyncJamba1_5Tokenizer(BaseJambaTokenizer, AsyncBaseTokenizer):
    """Asynchronous tokenizer for the Jamba 1.5 model family.

    Instances must be created through the async factory ``create`` (or
    ``Tokenizer.get_async_tokenizer``): loading the underlying tokenizer
    requires awaiting, so direct construction raises ``ValueError``.
    """

    _model_path: str
    # Lazily initialized; stays None until _init_tokenizer completes.
    _tokenizer: Optional[Tokenizer] = None
    _cache_dir: Optional[PathLike] = None

    def __init__(self):
        # Intentionally unusable: initialization is async-only (see create()).
        raise ValueError(
            "Do not create AsyncJamba1_5Tokenizer directly. Use either AsyncJamba1_5Tokenizer.create or "
            "Tokenizer.get_async_tokenizer"
        )

    @classmethod
    async def create(
        cls,
        model_path: str,
        cache_dir: Optional[PathLike] = None,
    ):
        """Asynchronously build and initialize a tokenizer instance.

        Args:
            model_path: str
                The identifier of a Model on the Hugging Face Hub, that contains a tokenizer.json file
            cache_dir: Optional[PathLike]
                The directory to cache the tokenizer.json file.
                If not provided, the default cache directory will be used
        """
        # Bypass __init__ (which raises) and set up instance state manually.
        self = cls.__new__(cls)
        self._model_path = model_path
        self._cache_dir = cache_dir or _DEFAULT_MODEL_CACHE_DIR
        await self._init_tokenizer()
        return self

    async def encode(self, text: str, **kwargs) -> List[int]:
        """Encode *text* into a list of token ids."""
        if not self._tokenizer:
            # Lazily (re)initialize if the tokenizer has not been loaded yet.
            await self._init_tokenizer()

        return await self._make_async_call(callback_func=self._encode, text=text, **kwargs)

    async def decode(self, token_ids: List[int], **kwargs) -> str:
        """Decode a list of token ids back into text."""
        if not self._tokenizer:
            await self._init_tokenizer()

        return await self._make_async_call(callback_func=self._decode, token_ids=token_ids, **kwargs)

    async def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        if not self._tokenizer:
            await self._init_tokenizer()

        return await self._make_async_call(callback_func=self._convert_tokens_to_ids, tokens=tokens)

    async def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs) -> Union[str, List[str]]:
        if not self._tokenizer:
            await self._init_tokenizer()

        return await self._make_async_call(callback_func=self._convert_ids_to_tokens, token_ids=token_ids, **kwargs)

    @property
    def vocab_size(self) -> int:
        """Size of the underlying tokenizer's vocabulary.

        Raises:
            ValueError: if the tokenizer was never initialized (a property
                cannot await, so it cannot lazily initialize like the
                async methods above do).
        """
        if not self._tokenizer:
            raise ValueError(
                "Tokenizer not properly initialized. Please do not initialize the tokenizer directly. Use "
                "Tokenizer.get_async_tokenizer instead."
            )
        return self._tokenizer.get_vocab_size()

    async def _init_tokenizer(self):
        # Prefer the on-disk copy; otherwise fetch from the Hub and cache it.
        if self._is_cached(self._cache_dir):
            self._tokenizer = await self._load_from_cache(self._cache_dir / _TOKENIZER_FILE)
        else:
            tokenizer_from_pretrained = await self._make_async_call(
                callback_func=Tokenizer.from_pretrained,
                identifier=self._model_path,
            )

            tokenizer = cast(
                Tokenizer,
                tokenizer_from_pretrained,
            )
            self._cache_tokenizer(tokenizer, self._cache_dir)

            self._tokenizer = tokenizer

    async def _load_from_cache(self, cache_file: Path) -> Tokenizer:
        # Tokenizer.from_file is blocking, so it is dispatched off the event loop.
        tokenizer_from_file = await self._make_async_call(callback_func=Tokenizer.from_file, path=str(cache_file))
        return cast(Tokenizer, tokenizer_from_file)
6 changes: 3 additions & 3 deletions ai21_tokenizer/jamba_instruct_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@

from ai21_tokenizer import BaseTokenizer, AsyncBaseTokenizer
from ai21_tokenizer.file_utils import PathLike
from ai21_tokenizer.base_jamba_instruct_tokenizer import BaseJambaInstructTokenizer
from ai21_tokenizer.base_jamba_tokenizer import BaseJambaTokenizer

_TOKENIZER_FILE = "tokenizer.json"
_DEFAULT_MODEL_CACHE_DIR = Path(tempfile.gettempdir()) / "jamba_instruct"


class JambaInstructTokenizer(BaseJambaInstructTokenizer, BaseTokenizer):
class JambaInstructTokenizer(BaseJambaTokenizer, BaseTokenizer):
def __init__(
self,
model_path: str,
Expand Down Expand Up @@ -62,7 +62,7 @@ def vocab_size(self) -> int:
return self._tokenizer.get_vocab_size()


class AsyncJambaInstructTokenizer(BaseJambaInstructTokenizer, AsyncBaseTokenizer):
class AsyncJambaInstructTokenizer(BaseJambaTokenizer, AsyncBaseTokenizer):
_model_path: str
_tokenizer: Tokenizer = None
_cache_dir: PathLike = None
Expand Down
36 changes: 36 additions & 0 deletions ai21_tokenizer/tokenizer_factory.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,37 @@
import os
import tempfile
from pathlib import Path

from ai21_tokenizer.base_tokenizer import BaseTokenizer, AsyncBaseTokenizer
from ai21_tokenizer.jamba_instruct_tokenizer import JambaInstructTokenizer, AsyncJambaInstructTokenizer
from ai21_tokenizer.jamba_1_5_tokenizer import Jamba1_5Tokenizer, AsyncJamba1_5Tokenizer
from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer, AsyncJurassicTokenizer

_LOCAL_RESOURCES_PATH = Path(__file__).parent / "resources"
_ENV_CACHE_DIR_KEY = "AI21_TOKENIZER_CACHE_DIR"
JAMBA_TOKENIZER_HF_PATH = "ai21labs/Jamba-v0.1"
JAMBA_1_5_MINI_TOKENIZER_HF_PATH = "ai21labs/AI21-Jamba-1.5-Mini"
JAMBA_1_5_LARGE_TOKENIZER_HF_PATH = "ai21labs/AI21-Jamba-1.5-Large"


def _get_cache_dir(tokenizer_name: str) -> Path:
    """Resolve the directory used to cache a pretrained tokenizer's files.

    The ``AI21_TOKENIZER_CACHE_DIR`` environment variable, when set,
    overrides the default of ``<system temp dir>/<sanitized name>``.

    Args:
        tokenizer_name: Pretrained tokenizer name, e.g. "jamba-1.5-mini-tokenizer".

    Returns:
        The cache directory path (this function does not create it).
    """
    env_cache_dir = os.getenv(_ENV_CACHE_DIR_KEY)
    if env_cache_dir is not None:
        return Path(env_cache_dir)

    # "." and "-" are awkward in directory names, so normalize both to "_".
    tokenizer_name_as_path = tokenizer_name.replace(".", "_").replace("-", "_")
    return Path(tempfile.gettempdir()) / tokenizer_name_as_path


class PreTrainedTokenizers:
    """Names accepted by TokenizerFactory.get_tokenizer / get_async_tokenizer."""

    J2_TOKENIZER = "j2-tokenizer"
    JAMBA_INSTRUCT_TOKENIZER = "jamba-instruct-tokenizer"
    # Resolves to the same tokenizer as JAMBA_INSTRUCT_TOKENIZER in the factory.
    JAMBA_TOKENIZER = "jamba-tokenizer"
    JAMBA_1_5_MINI_TOKENIZER = "jamba-1.5-mini-tokenizer"
    JAMBA_1_5_LARGE_TOKENIZER = "jamba-1.5-large-tokenizer"


class TokenizerFactory:
Expand All @@ -27,6 +45,14 @@ def get_tokenizer(
cls,
tokenizer_name: str = PreTrainedTokenizers.J2_TOKENIZER,
) -> BaseTokenizer:
cache_dir = _get_cache_dir(tokenizer_name=tokenizer_name)

if tokenizer_name == PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER:
return Jamba1_5Tokenizer(model_path=JAMBA_1_5_MINI_TOKENIZER_HF_PATH, cache_dir=cache_dir)

if tokenizer_name == PreTrainedTokenizers.JAMBA_1_5_LARGE_TOKENIZER:
return Jamba1_5Tokenizer(model_path=JAMBA_1_5_LARGE_TOKENIZER_HF_PATH, cache_dir=cache_dir)

if (
tokenizer_name == PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER
or tokenizer_name == PreTrainedTokenizers.JAMBA_TOKENIZER
Expand All @@ -43,6 +69,16 @@ async def get_async_tokenizer(
cls,
tokenizer_name: str = PreTrainedTokenizers.J2_TOKENIZER,
) -> AsyncBaseTokenizer:
cache_dir = _get_cache_dir(tokenizer_name=tokenizer_name)

if tokenizer_name == PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER:
return await AsyncJamba1_5Tokenizer.create(model_path=JAMBA_1_5_MINI_TOKENIZER_HF_PATH, cache_dir=cache_dir)

if tokenizer_name == PreTrainedTokenizers.JAMBA_1_5_LARGE_TOKENIZER:
return await AsyncJamba1_5Tokenizer.create(
model_path=JAMBA_1_5_LARGE_TOKENIZER_HF_PATH, cache_dir=cache_dir
)

if (
tokenizer_name == PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER
or tokenizer_name == PreTrainedTokenizers.JAMBA_TOKENIZER
Expand Down
18 changes: 18 additions & 0 deletions examples/async_jamba_1_5_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Example: encode and decode text with the async Jamba 1.5 Mini tokenizer."""
import asyncio

from ai21_tokenizer import Tokenizer, PreTrainedTokenizers


async def main():
    """Round-trip a sample sentence through the async tokenizer."""
    tokenizer = await Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_1_5_MINI_TOKENIZER)

    example_sentence = "This sentence should be encoded and then decoded. Hurray!!!!"
    encoded = await tokenizer.encode(example_sentence)
    decoded = await tokenizer.decode(encoded)

    assert decoded == example_sentence
    print("Example sentence: " + example_sentence)
    print("Encoded and decoded: " + decoded)


# Guard the entry point so importing this module does not trigger network I/O.
if __name__ == "__main__":
    asyncio.run(main())
11 changes: 11 additions & 0 deletions examples/jamba_1_5_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Example: encode and decode text with the Jamba 1.5 Mini tokenizer.
from ai21_tokenizer import Jamba1_5Tokenizer

# Hugging Face Hub identifier of the model whose tokenizer.json is used.
model_path = "ai21labs/AI21-Jamba-1.5-Mini"

tokenizer = Jamba1_5Tokenizer(model_path=model_path)

example_sentence = "This sentence should be encoded and then decoded. Hurray!!!!"
encoded = tokenizer.encode(example_sentence)
decoded = tokenizer.decode(encoded)

# The round-trip should reproduce the original sentence exactly.
assert decoded == example_sentence
Loading

0 comments on commit e052b22

Please sign in to comment.