Skip to content

Commit

Permalink
feat: Add the is_start parameter to JurassicTokenizer.encode() (#72)
Browse files Browse the repository at this point in the history
* feat: Add the is_start parameter to JurassicTokenizer.encode()

* refactor: take 'is_start' from kwargs
  • Loading branch information
tomeras91 authored Dec 28, 2023
1 parent 96f384f commit 296bda5
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion ai21_tokenizer/jurassic_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def encode(self, text: str, **kwargs) -> List[int]:
"""
Tokenizes the input text and returns its token ids
"""
+ is_start = kwargs.get("is_start", True)
lines = text.split("\n")
toks = []

Expand All @@ -163,7 +164,7 @@ def encode(self, text: str, **kwargs) -> List[int]:
if not line:
continue
# We add the dummy prefix on every newline, and also for the 1st line if it's a 'start'
- if self._manual_add_dummy_prefix and i >= 0:
+ if self._manual_add_dummy_prefix and (i > 0 or (i == 0 and is_start)):
line = " " + line
toks.extend(self._encode(line))

Expand Down

0 comments on commit 296bda5

Please sign in to comment.