forked from karpathy/llama2.c
Add llama3 tokenizer export script and tokenizer binary
The llama3 tokenizer export script is taken from @jameswdelancey's llama3.c project (https://github.com/jameswdelancey/llama3.c). Much appreciation & credits to @jameswdelancey.
Showing 4 changed files with 142 additions and 14 deletions.
@@ -0,0 +1,8 @@
blobfile==2.1.1
numpy
pytest==8.2.0
Requests
tiktoken==0.6.0
torch
tqdm
wandb==0.16.6
@@ -0,0 +1,115 @@
# Taken from llama code and lightly modified
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.

import array
import os
import struct
import argparse
from pathlib import Path
from typing import List

import tiktoken
from tiktoken.load import load_tiktoken_bpe

TOKENIZER_MODEL = "tokenizer.model"  # the llama tiktoken tokenizer model


class Tokenizer:
    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"

    def __init__(self, tokenizer_model=None):
        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        mergeable_ranks = load_tiktoken_bpe(model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        num_base_tokens = len(mergeable_ranks)
        num_reserved_special_tokens = 256

        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, num_reserved_special_tokens - 5)
        ]
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.n_words = self.model.n_vocab
        self.bos_id = self.special_tokens["<|begin_of_text|>"]
        self.eos_id = self.special_tokens["<|end_of_text|>"]
        self.pad_id = -1
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }

    def encode(
        self, s: str, bos: bool, eos: bool, allowed_special, disallowed_special
    ) -> List[int]:
        assert type(s) is str
        # encode the string with tiktoken
        t = self.model.encode(
            s,
            allowed_special=allowed_special,
            disallowed_special=disallowed_special,
        )

        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: List[int]) -> str:
        return self.model.decode(t)

    def export(self):

        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []
        for i in range(self.n_words):

            # decode the token bytes; tiktoken has no per-token score,
            # so use the token id itself as the score
            t = self.model.decode_single_token_bytes(i)
            s = i
            tokens.append(t)
            scores.append(s)

        # record the max token length
        max_token_length = max(len(t) for t in tokens)

        # write to a binary file
        # the tokenizer.bin path mirrors the .model path, with a .bin suffix
        tokenizer_bin = self.model_path.replace(".model", ".bin")
        with open(tokenizer_bin, "wb") as f:
            f.write(struct.pack("I", max_token_length))
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer")

    args = parser.parse_args()

    t = Tokenizer(args.tokenizer_model)
    t.export()
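
For reference, a minimal usage sketch of the class above, assuming a llama3 tokenizer.model file is present in the working directory; the sample text and paths are purely illustrative:

# hypothetical usage of the Tokenizer class above
t = Tokenizer("tokenizer.model")
ids = t.encode(
    "hello world", bos=False, eos=False,
    allowed_special=set(), disallowed_special=(),
)
print(ids)            # list of token ids
print(t.decode(ids))  # round-trips back to "hello world"
t.export()            # writes tokenizer.bin next to tokenizer.model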
Binary file not shown.
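
Based on export() above, the binary file added by this commit should consist of a uint32 max token length followed by one (float32 score, uint32 byte length, raw token bytes) record per token. A minimal sketch for reading such a file back, with the tokenizer.bin filename assumed:

import struct

def read_tokenizer_bin(path="tokenizer.bin"):
    # parse the layout written by Tokenizer.export():
    # uint32 max_token_length, then per token: float32 score, uint32 length, raw bytes
    tokens, scores = [], []
    with open(path, "rb") as f:
        (max_token_length,) = struct.unpack("I", f.read(4))
        while True:
            header = f.read(8)  # one (score, length) record header
            if len(header) < 8:
                break  # end of file
            score, length = struct.unpack("fI", header)
            tokens.append(f.read(length))
            scores.append(score)
    return tokens, scores, max_token_length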