diff --git a/doc/stories260K.md b/doc/stories260K.md
new file mode 100644
index 00000000..c17b985d
--- /dev/null
+++ b/doc/stories260K.md
@@ -0,0 +1,58 @@
+# stories260K
+
+[Stories260K huggingface link](https://huggingface.co/karpathy/tinyllamas)
+
+The 260K model is a tiny model used for testing, and was trained as follows:
+
+```
+python train.py \
+    --out_dir="outmini" \
+    --batch_size=128 \
+    --max_seq_len=512 \
+    --gradient_accumulation_steps=1 \
+    --vocab_source="custom" \
+    --vocab_size=512 \
+    --dim=64 \
+    --n_layers=5 \
+    --n_heads=8 \
+    --n_kv_heads=4 \
+    --multiple_of=4 \
+    --learning_rate=1e-3 \
+    --dropout=0.05 \
+    --weight_decay=0.01 \
+    --max_iters=100000 \
+    --beta2=0.99 \
+    --warmup_iters=1000 \
+    --eval_interval=2000 \
+    --eval_iters=100 \
+    --compile=True
+```
+
+You'll notice that `n_kv_heads` is 4 while `n_heads` is 8, so two heads at a time share their key/value projections, i.e. this model is 2X multiquery. You'll also notice that we're using a custom tokenizer with 512 tokens. The model trained for ~10 minutes (?) on my A100 and achieves a validation loss of 1.2968.
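+
+To make the 2X multiquery point concrete, here is a tiny standalone sketch (not code from this repo) of how 8 query heads end up sharing 4 key/value heads; the shapes assume this model's config, i.e. `head_dim = dim // n_heads = 64 // 8 = 8`:
+
+```python
+import torch
+
+n_heads, n_kv_heads, head_dim, seq_len = 8, 4, 8, 16
+q = torch.randn(seq_len, n_heads, head_dim)     # stand-in for the 8 per-head queries
+k = torch.randn(seq_len, n_kv_heads, head_dim)  # only 4 key (and value) heads exist
+# repeat each kv head n_heads // n_kv_heads = 2 times, so every pair of
+# query heads attends against the same shared key/value head
+k_shared = torch.repeat_interleave(k, n_heads // n_kv_heads, dim=1)
+assert k_shared.shape == q.shape                # (16, 8, 8)
+```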
+
+Sampling this model at temperature 0.0 (i.e. deterministic greedy argmax sampling) gives:
+
+```
+$ ./run stories260K/stories260K.bin -z stories260K/tok512.bin -t 0.0
+Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a big, red ball. She wanted to play with it, but it was too high.
+Lily's mom said, "Lily, let's go to the park." Lily was sad and didn't know what to do. She said, "I want to play with your ball, but I can't find it."
+Lily was sad and didn't know what to do. She said, "I'm sorry, Lily. I didn't know what to do."
+Lily didn't want to help her mom, so she said, "I'm sorry, mom. I didn't know what to do." Her mom said, "Don't worry, Lily. We can help you.
+```
+
+You can reproduce the same in Python by running `sample.py`:
+
+```
+$ python sample.py --checkpoint=stories260K/stories260K.pt --tokenizer=stories260K/tok512.model --temperature=0.0 --max_new_tokens=257
+```
+
+I hardcoded max new tokens to 257 because the `sample.py` script doesn't currently terminate on the special BOS token like the run.c script does. Sampling at temperature 1.0 with top-p of 0.9 gives a bit more reasonable samples:
+
+```
+$ ./run stories260K/stories260K.bin -z stories260K/tok512.bin -t 1.0 -p 0.9 -s 133742
+Once upon a time, there was a little boy named Timmy. Timmy loved to play with his toys and eat sandwiches. One day, Timmy's mom told him it was time to rest for a while. Timmy's friend Billy came over and took him a down.
+Timmy's mom saw that Timmy was sad, but Timmy said, "I didn't understand what is it! We need to find some leafs." Timmy thought about it and took a deep breath on a spoon. He hoped it was important to be kind and continued to find its image next time.
+After they finished getting, Timmy's dad came up to his house and promised to help Timmy.
+```
+
+Hey, you can't expect too much from a 260K parameter model. I'm even mildly shocked we get this far :D
diff --git a/doc/train_llama_tokenizer.md b/doc/train_llama_tokenizer.md
new file mode 100644
index 00000000..a03da557
--- /dev/null
+++ b/doc/train_llama_tokenizer.md
@@ -0,0 +1,99 @@
+# training llama tokenizer
+
+How does Meta train their sentencepiece tokenizer? You can print the config as follows:
+
+```python
+import sentencepiece.sentencepiece_model_pb2
+mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
+mp.ParseFromString(open("tokenizer.model", "rb").read())
+print(mp.trainer_spec)
+print(mp.normalizer_spec)
+```
+
+This gives:
+
+```
+trainer_spec {
+  input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
+  model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
+  model_type: BPE
+  vocab_size: 32000
+  self_test_sample_size: 0
+  input_format: "text"
+  character_coverage: 0.9999499917030334
+  input_sentence_size: 200000000
+  seed_sentencepiece_size: 1000000
+  shrinking_factor: 0.75
+  num_threads: 80
+  num_sub_iterations: 2
+  max_sentence_length: 4192
+  shuffle_input_sentence: true
+  max_sentencepiece_length: 16
+  split_by_unicode_script: true
+  split_by_whitespace: true
+  split_by_number: true
+  treat_whitespace_as_suffix: false
+  split_digits: true
+  allow_whitespace_only_pieces: true
+  vocabulary_output_piece_score: true
+  hard_vocab_limit: true
+  use_all_vocab: false
+  byte_fallback: true
+  required_chars: ""
+  unk_id: 0
+  bos_id: 1
+  eos_id: 2
+  pad_id: -1
+  unk_surface: " \342\201\207 "
+  unk_piece: "<unk>"
+  bos_piece: "<s>"
+  eos_piece: "</s>"
+  pad_piece: "<pad>"
+  train_extremely_large_corpus: false
+  enable_differential_privacy: false
+  differential_privacy_noise_level: 0.0
+  differential_privacy_clipping_threshold: 0
+}
+normalizer_spec {
+  name: "identity"
+  precompiled_charsmap: ""
+  add_dummy_prefix: true
+  remove_extra_whitespaces: false
+  normalization_rule_tsv: ""
+}
+```
+
+We can use the sentencepiece `spm_train` tool to train the same kind of model, just optionally smaller. Here are the [options docs](https://github.com/google/sentencepiece/blob/master/doc/options.md) we can refer to. It's not much, but it helps.
+
+We'll depart on one setting: I recommend changing `character_coverage` -> 1.0. We also want to note the following important settings, which come up in the paper and are not necessarily the sentencepiece defaults:
+
+```
+--split_digits = true
+--allow_whitespace_only_pieces = true
+--byte_fallback = true
+--normalization_rule_name = identity
+```
+
+With this in mind we can train a sentencepiece vocab in what I believe is probably the same way Meta trained theirs:
+
+```
+spm_train --input="$input" \
+          --model_prefix="$model_prefix" \
+          --model_type=bpe \
+          --vocab_size="$vocab_size" \
+          --self_test_sample_size=0 \
+          --input_format="text" \
+          --character_coverage=1.0 \
+          --num_threads="$(nproc)" \
+          --split_digits=true \
+          --allow_whitespace_only_pieces=true \
+          --byte_fallback=true \
+          --unk_surface=" \342\201\207 " \
+          --normalization_rule_name=identity
+```
+
+Here `$input` is the input text file, `$model_prefix` is the output path prefix, and `$vocab_size` is the desired vocab size; by default we take over all the CPU threads of the machine.
+
+Lastly, note that sentencepiece is a bit weird and expects "sentences" delimited by newlines as the input. You can't just put in a massive block of text. And they have a hyperparameter that controls the maximum size of a "sentence". Fwiw I really dislike this design choice around a weird concept of a "sentence". It should just be a block of text with no assumptions. But here we are.
+
+Look into the file `tinystories.py`, where we train the vocab in the same way, but using Python bindings instead.
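+
+For reference, here is a minimal sketch of what that looks like through the Python bindings. It simply mirrors the `spm_train` flags above; `input_file`, `model_prefix` and `vocab_size` are placeholders you supply, the `train_vocab` helper name is just for illustration, and this is not a verbatim copy of `tinystories.py`:
+
+```python
+import os
+import sentencepiece as spm
+
+def train_vocab(input_file: str, model_prefix: str, vocab_size: int):
+    # same settings as the spm_train invocation above, via the Python API
+    spm.SentencePieceTrainer.train(
+        input=input_file,              # newline-delimited "sentences"
+        model_prefix=model_prefix,     # writes model_prefix.model / .vocab
+        model_type="bpe",
+        vocab_size=vocab_size,
+        self_test_sample_size=0,
+        input_format="text",
+        character_coverage=1.0,
+        num_threads=os.cpu_count(),
+        split_digits=True,
+        allow_whitespace_only_pieces=True,
+        byte_fallback=True,
+        unk_surface=r" \342\201\207 ",
+        normalization_rule_name="identity",
+    )
+
+train_vocab("tiny.txt", "tokenizer_tiny", 512)
+```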
diff --git a/export.py b/export.py
index a60d7cfe..4143f70f 100644
--- a/export.py
+++ b/export.py
@@ -323,9 +323,10 @@ def concat_weights(models):
     config.multiple_of = params["multiple_of"]
     config.norm_eps = params["norm_eps"]
 
-    config.vocab_size = 32000
+    config.vocab_size = state_dict['tok_embeddings.weight'].shape[0]
     config.max_seq_len = 2048
 
+    # create a new Transformer object and set weights
     model = Transformer(config)
 
diff --git a/test.c b/test.c
index 23fca8c8..4203efde 100644
--- a/test.c
+++ b/test.c
@@ -73,6 +73,9 @@ void test_prompt_encodings() {
     char* prompt4 = "Translate English to French:\n\n sea otter => loutre de mer\n peppermint => menthe poivrée\n plush girafe => girafe peluche\n cheese =>";
     int expected_tokens4[] = {1, 4103, 9632, 4223, 304, 5176, 29901, 13, 13, 4706, 7205, 4932, 357, 1149, 301, 449, 276, 316, 2778, 13, 4706, 1236, 407, 837, 524, 1149, 6042, 354, 772, 440, 29878, 1318, 13, 4706, 715, 1878, 330, 3055, 1725, 1149, 330, 3055, 1725, 4639, 28754, 13, 4706, 923, 968, 1149};
     test_prompt_encoding(&tokenizer, prompt4, expected_tokens4, sizeof(expected_tokens4) / sizeof(int));
+
+    // memory and file handles cleanup
+    free_tokenizer(&tokenizer);
 }
 
 int main(int argc, char *argv[]) {
diff --git a/train_vocab.sh b/train_vocab.sh
deleted file mode 100755
index 7803af8b..00000000
--- a/train_vocab.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/bin/bash
-
-# Trains a sentencepiece tokenizer model on a bunch of given data, my best
-# effort attempt to replicate how Meta trained their Llama 2 tokenizer.
-
-# usage: $ train_vocab.sh <input_file> <model_prefix> <vocab_size>
-# example:
-# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
-# requirements:
-# install https://github.com/google/sentencepiece
-
-# check if the correct number of arguments are provided
-if [ $# -ne 3 ]; then
-    echo "Usage: $0 <input_file> <model_prefix> <vocab_size>"
-    exit 1
-fi
-
-# assign command-line arguments to variables
-input=$1
-model_prefix=$2
-vocab_size=$3
-
-# check if input file exists
-if [ ! -f "$input" ]; then
-    echo "Usage: $0 <input_file> <model_prefix> <vocab_size>"
-    echo "input '$input' not found."
-    exit 1
-fi
-
-# check if vocab_size is a positive integer
-if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
-    echo "Usage: $0 <input_file> <model_prefix> <vocab_size>"
-    echo "vocab_size must be a positive integer."
-    exit 1
-fi
-
-# Print the processed inputs
-echo "Input: $input"
-echo "Model Prefix: $model_prefix"
-echo "Vocabulary Size: $vocab_size"
-
-# train a sentencepiece tokenizer model
-# Llama 2 config can be printed as follows:
-
-# import sentencepiece.sentencepiece_model_pb2
-# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
-# mp.ParseFromString(open("tokenizer.model", "rb").read())
-# print(mp.trainer_spec)
-# print(mp.normalizer_spec)
-
-# this gives:
-
-# trainer_spec {
-#   input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
-#   model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
-#   model_type: BPE
-#   vocab_size: 32000
-#   self_test_sample_size: 0
-#   input_format: "text"
-#   character_coverage: 0.9999499917030334
-#   input_sentence_size: 200000000
-#   seed_sentencepiece_size: 1000000
-#   shrinking_factor: 0.75
-#   num_threads: 80
-#   num_sub_iterations: 2
-#   max_sentence_length: 4192
-#   shuffle_input_sentence: true
-#   max_sentencepiece_length: 16
-#   split_by_unicode_script: true
-#   split_by_whitespace: true
-#   split_by_number: true
-#   treat_whitespace_as_suffix: false
-#   split_digits: true
-#   allow_whitespace_only_pieces: true
-#   vocabulary_output_piece_score: true
-#   hard_vocab_limit: true
-#   use_all_vocab: false
-#   byte_fallback: true
-#   required_chars: ""
-#   unk_id: 0
-#   bos_id: 1
-#   eos_id: 2
-#   pad_id: -1
-#   unk_surface: " \342\201\207 "
-#   unk_piece: "<unk>"
-#   bos_piece: "<s>"
-#   eos_piece: "</s>"
-#   pad_piece: "<pad>"
-#   train_extremely_large_corpus: false
-#   enable_differential_privacy: false
-#   differential_privacy_noise_level: 0.0
-#   differential_privacy_clipping_threshold: 0
-# }
-# normalizer_spec {
-#   name: "identity"
-#   precompiled_charsmap: ""
-#   add_dummy_prefix: true
-#   remove_extra_whitespaces: false
-#   normalization_rule_tsv: ""
-# }
-
-# let's now use spm_train to train this exact model
-# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
-
-# we'll depart on a few settings:
-# character_coverage -> 1.0
-
-# other important notes:
-# --split-digits = true, per the paper
-# --allow_whitespace_only_pieces is true, default in spm is false
-# --byte_fallback is true, default in spm is false
-# --normalization_rule_name is identity, default in spm is nmt_nfkc
-
-spm_train --input="$input" \
-          --model_prefix="$model_prefix" \
-          --model_type=bpe \
-          --vocab_size="$vocab_size" \
-          --self_test_sample_size=0 \
-          --input_format="text" \
-          --character_coverage=1.0 \
-          --num_threads="$(nproc)" \
-          --split_digits=true \
-          --allow_whitespace_only_pieces=true \
-          --byte_fallback=true \
-          --unk_surface=" \342\201\207 " \
-          --normalization_rule_name=identity \