Skip to content

Commit

Permalink
Add possessive quantifiers to legacy encodings as well
Browse files Browse the repository at this point in the history
  • Loading branch information
Lőrinc committed Feb 12, 2024
1 parent 4bf478b commit ccd8702
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
1 change: 1 addition & 0 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES


+ @pytest.mark.skip(reason="Takes a really long time to finish, but was added to reproduce a crash.")
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_extremely_big_encoding(make_enc: Callable[[], tiktoken.Encoding]):
enc = make_enc()
Expand Down
10 changes: 5 additions & 5 deletions tiktoken_ext/openai_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def gpt2():
# The pattern in the original GPT-2 release is:
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# This is equivalent, but executes faster:
- "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+ "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
Expand All @@ -34,7 +34,7 @@ def r50k_base():
return {
"name": "r50k_base",
"explicit_n_vocab": 50257,
- "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+ "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
Expand All @@ -48,7 +48,7 @@ def p50k_base():
return {
"name": "p50k_base",
"explicit_n_vocab": 50281,
- "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+ "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
Expand All @@ -62,7 +62,7 @@ def p50k_edit():
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
return {
"name": "p50k_edit",
- "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+ "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": special_tokens,
}
Expand All @@ -82,7 +82,7 @@ def cl100k_base():
}
return {
"name": "cl100k_base",
- "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
+ "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s*[\r\n]|\s+(?!\S)|\s++""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": special_tokens,
}
Expand Down

0 comments on commit ccd8702

Please sign in to comment.