Skip to content

Commit

Permalink
Sync codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
hauntsaninja committed Jun 1, 2023
1 parent 095924e commit affbd6e
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 1 deletion.
25 changes: 25 additions & 0 deletions tiktoken/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,31 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
"""
return [self.decode_single_token_bytes(token) for token in tokens]

def decode_with_offsets(self, tokens: list[int]) -> tuple[str, list[int]]:
    """Decodes a list of tokens into a string and a list of offsets.

    Each offset is the index into text corresponding to the start of each token.
    If UTF-8 character boundaries do not line up with token boundaries, the offset is the index
    of the first character that contains bytes from the token.

    This will currently raise if given tokens that decode to invalid UTF-8; this behaviour may
    change in the future to be more permissive.

    >>> enc.decode_with_offsets([31373, 995])
    ('hello world', [0, 5])
    """
    token_bytes = self.decode_tokens_bytes(tokens)

    def _is_continuation(byte: int) -> bool:
        # UTF-8 continuation bytes have the bit pattern 0b10xxxxxx.
        return 0x80 <= byte < 0xC0

    offsets: list[int] = []
    chars_so_far = 0
    for piece in token_bytes:
        if _is_continuation(piece[0]):
            # Token starts mid-character: point at the character that already
            # began in a previous token (floored at 0 for safety).
            offsets.append(max(0, chars_so_far - 1))
        else:
            offsets.append(chars_so_far)
        # Every non-continuation byte starts exactly one character.
        chars_so_far += sum(not _is_continuation(b) for b in piece)

    # TODO: assess correctness for errors="ignore" and errors="replace"
    text = b"".join(token_bytes).decode("utf-8", errors="strict")
    return text, offsets

def decode_batch(
self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
) -> list[str]:
Expand Down
4 changes: 3 additions & 1 deletion tiktoken/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
# chat
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
"gpt-35-turbo": "cl100k_base", # Azure deployment name
}

MODEL_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4": "cl100k_base",
"gpt-3.5-turbo": "cl100k_base",
"gpt-35-turbo": "cl100k_base", # Azure deployment name
# text
"text-davinci-003": "p50k_base",
"text-davinci-002": "p50k_base",
Expand Down Expand Up @@ -69,7 +71,7 @@ def encoding_for_model(model_name: str) -> Encoding:
if encoding_name is None:
raise KeyError(
f"Could not automatically map {model_name} to a tokeniser. "
"Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
) from None

return get_encoding(encoding_name)

0 comments on commit affbd6e

Please sign in to comment.