hyperparameters added
Hk669 committed May 29, 2024
1 parent 75644b1 commit 50c86d9
Showing 3 changed files with 27 additions and 10 deletions.
21 changes: 15 additions & 6 deletions bpetokenizer/base.py
@@ -78,10 +78,10 @@ def _build_vocab(self) -> dict:
vocab[idx] = special.encode("utf-8")
return vocab

- def save(self, file_name, mode="file"):
+ def save(self, file_name, mode="json"):
"""
Writes metadata and vocabulary information to the model and vocab files.
mode: str, default="file" | "json" to save the model and vocab in json format.
mode: str, default="json" | "file" to save the model and vocab in file format.
"""
if mode == "file":
model_file = file_name + ".model"
@@ -127,10 +127,10 @@ def save(self, file_name, mode="file"):
raise ValueError("mode should be either 'file' or 'json'")


- def load(self, file_name, mode="file"):
+ def load(self, file_name, mode="json"):
"""
Load the model and vocab files to the tokenizer.
mode: str, default="file" | "json" to load the model and vocab in json format.
mode: str, default="json" | "file" to load the model and vocab in file format.
"""
if mode == "file":
assert file_name.endswith(".model")
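
The default serialization mode for both save and load flips from "file" to "json". A minimal usage sketch, assuming the package exposes a BPETokenizer class and that json mode reads and writes a .json file (only the ".model" suffix check for file mode is visible in this diff):

    from bpetokenizer import BPETokenizer   # class name and import path assumed

    tokenizer = BPETokenizer()
    tokenizer.train("some raw text for the byte-pair merges", vocab_size=280)

    tokenizer.save("my_tok")                 # default is now mode="json"
    tokenizer.save("my_tok", mode="file")    # legacy format, writes my_tok.model

    restored = BPETokenizer()
    restored.load("my_tok.json")             # json is the default for load as well
    # restored.load("my_tok.model", mode="file")  # file mode asserts the ".model" suffix
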
Expand Down Expand Up @@ -196,8 +196,15 @@ def decode(self, ids):
text = bytes_str.decode("utf-8", errors="replace")
return text

- def train(self, texts, vocab_size, verbose=False):
- """Method for training the tokenizer."""
+ def train(self, texts, vocab_size, verbose=False, min_frequency=2):
+ """
+ Train the tokenizer on the given texts and vocab size. The vocab size should be greater than 256.
+ params:
+ texts: str (the texts required for the tokenizer to train the vocabulary)
+ vocab_size: int (the size of the vocab; GPT-4's vocab size is around 100k)
+ verbose: bool (to get extra visibility and an overview of the internal processes)
+ min_frequency: int (the minimum frequency a pair must reach to be merged and added to the vocab as a new token)
+ """
assert vocab_size >= 256
num_merges = vocab_size - 256

@@ -212,6 +219,8 @@
pair = max(stats, key=stats.get) # returns the highest frequency pair
idx = 256 + i

+ if stats[pair] < min_frequency:
+ break
ids = merge(ids, pair, idx)
merges[pair] = idx
vocab[idx] = vocab[pair[0]] + vocab[pair[1]] # concat of bytes
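
The new min_frequency hyperparameter stops the merge loop once the most frequent remaining pair falls below the threshold, so the learned vocabulary can end up smaller than vocab_size. A simplified, self-contained sketch of that loop (the library uses its own get_stats and merge helpers; merge_pair below is a stand-in, not the library code):

    def merge_pair(ids, pair, idx):
        """Replace every occurrence of `pair` in `ids` with the new token id `idx`."""
        out, i = [], 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
                out.append(idx)
                i += 2
            else:
                out.append(ids[i])
                i += 1
        return out

    def train_sketch(text, vocab_size, min_frequency=2):
        ids = list(text.encode("utf-8"))
        merges = {}
        vocab = {i: bytes([i]) for i in range(256)}
        for i in range(vocab_size - 256):
            stats = {}
            for a, b in zip(ids, ids[1:]):                # count adjacent pairs
                stats[(a, b)] = stats.get((a, b), 0) + 1
            if not stats:                                 # nothing left to merge
                break
            pair = max(stats, key=stats.get)              # most frequent pair
            if stats[pair] < min_frequency:               # new cutoff: stop merging
                break                                     # once pairs become too rare
            idx = 256 + i
            ids = merge_pair(ids, pair, idx)
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]  # concat of bytes
        return merges, vocab
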
14 changes: 11 additions & 3 deletions bpetokenizer/tokenizer.py
@@ -32,8 +32,15 @@ def __init__(self, pattern=None, special_tokens=None):
self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()}


- def train(self, texts, vocab_size, verbose=False) -> None:
- """Train the tokenizer on the given texts and vocab size. The vocab size should be greater than 256."""
+ def train(self, texts, vocab_size, verbose=False, min_frequency=2) -> None:
+ """
+ Train the tokenizer on the given texts and vocab size. The vocab size should be greater than 256.
+ params:
+ texts: str (the texts required for the tokenizer to train the vocabulary)
+ vocab_size: int (the size of the vocab; GPT-4's vocab size is around 100k)
+ verbose: bool (to get extra visibility and an overview of the internal processes)
+ min_frequency: int (the minimum frequency a pair must reach to be merged and added to the vocab as a new token)
+ """
assert vocab_size >= 256
num_merges = vocab_size - 256

@@ -48,7 +55,8 @@ def train(self, texts, vocab_size, verbose=False) -> None:
stats = {}
for chunk in ids:
get_stats(chunk, stats)

+ if stats[pair] < min_frequency:
+ break
pair = max(stats, key=stats.get) # returns the highest frequency pair
idx = 256 + i

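
The same min_frequency cutoff is exposed on the regex/special-token tokenizer in tokenizer.py. A hypothetical call, assuming the class is named BPETokenizer, that it inherits save() from base.py, and that special_tokens maps token strings to ids (the numbers here are made up):

    from bpetokenizer import BPETokenizer   # class name and import path assumed

    tokenizer = BPETokenizer(special_tokens={"<|endoftext|>": 1000})

    corpus = "plain training text; pairs seen fewer than min_frequency times are never merged"
    tokenizer.train(corpus, vocab_size=300, verbose=True, min_frequency=3)

    tokenizer.save("my_tok")   # default mode="json" after this commit
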
2 changes: 1 addition & 1 deletion bpetokenizer/version.py
@@ -1 +1 @@
__version__ = "1.0.31"
__version__ = "1.0.32"
