Implementing the BPE Tokenizer from Scratch (#487)
rasbt authored Jan 17, 2025
1 parent 2fef211 commit 0d4967e
Showing 4 changed files with 1,463 additions and 86 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -102,6 +102,7 @@ Several folders contain optional materials as a bonus for interested readers:
- [Installing Python Packages and Libraries Used In This Book](setup/02_installing-python-libraries)
- [Docker Environment Setup Guide](setup/03_optional-docker-environment)
- **Chapter 2: Working with text data**
- [Byte Pair Encoding (BPE) Tokenizer From Scratch](ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb)
- [Comparing Various Byte Pair Encoding (BPE) Implementations](ch02/02_bonus_bytepair-encoder)
- [Understanding the Difference Between Embedding Layers and Linear Layers](ch02/03_bonus_embedding-vs-matmul)
- [Dataloader Intuition with Simple Numbers](ch02/04_bonus_dataloader-intuition)
4 changes: 3 additions & 1 deletion ch02/01_main-chapter-code/ch02.ipynb
@@ -1900,7 +1900,9 @@
"source": [
"See the [./dataloader.ipynb](./dataloader.ipynb) code notebook, which is a concise version of the data loader that we implemented in this chapter and will need for training the GPT model in upcoming chapters.\n",
"\n",
"See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions."
"See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions.\n",
"\n",
"See the [Byte Pair Encoding (BPE) Tokenizer From Scratch](../02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb) notebook if you are interested in learning how the GPT-2 tokenizer can be implemented and trained from scratch."
]
}
],
243 changes: 158 additions & 85 deletions ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
@@ -67,7 +67,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"tiktoken version: 0.5.1\n"
"tiktoken version: 0.7.0\n"
]
}
],
@@ -180,8 +180,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching encoder.json: 1.04Mit [00:00, 3.14Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 1.67Mit/s] \n"
"Fetching encoder.json: 1.04Mit [00:00, 3.47Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 2.07Mit/s] \n"
]
}
],
@@ -259,7 +259,7 @@
{
"data": {
"text/plain": [
"'4.34.0'"
"'4.48.0'"
]
},
"execution_count": 12,
@@ -278,78 +278,7 @@
"execution_count": 13,
"id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e4df871bb797435787143a3abe6b0231",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f11b27a4aabf43af9bf57f929683def6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d3aa9a24aacc43108ef2ed72e7bacd33",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f9341bc23b594bb68dcf8954bff6d9bd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c5f55f2f1dbc4152acc9b2061167ee0a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading config.json: 0%| | 0.00/665 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from transformers import GPT2Tokenizer\n",
"\n",
@@ -377,6 +306,100 @@
"hf_tokenizer(strings)[\"input_ids\"]"
]
},
{
"cell_type": "markdown",
"id": "9d0f2e95-8ae8-4606-a8e0-b0fce91cfac9",
"metadata": {},
"source": [
"<br>\n",
"&nbsp;\n",
"\n",
"## Using my own from-scratch BPE tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import io\n",
"import nbformat\n",
"import types\n",
"\n",
"def import_from_notebook():\n",
" def import_definitions_from_notebook(fullname, names):\n",
" current_dir = os.getcwd()\n",
" path = os.path.join(current_dir, \"..\", \"05_bpe-from-scratch\", fullname + \".ipynb\")\n",
" path = os.path.normpath(path)\n",
"\n",
" # Load the notebook\n",
" if not os.path.exists(path):\n",
" raise FileNotFoundError(f\"Notebook file not found at: {path}\")\n",
"\n",
" with io.open(path, \"r\", encoding=\"utf-8\") as f:\n",
" nb = nbformat.read(f, as_version=4)\n",
"\n",
" # Create a module to store the imported functions and classes\n",
" mod = types.ModuleType(fullname)\n",
" sys.modules[fullname] = mod\n",
"\n",
" # Go through the notebook cells and only execute function or class definitions\n",
" for cell in nb.cells:\n",
" if cell.cell_type == \"code\":\n",
" cell_code = cell.source\n",
" for name in names:\n",
" # Check for function or class definitions\n",
" if f\"def {name}\" in cell_code or f\"class {name}\" in cell_code:\n",
" exec(cell_code, mod.__dict__)\n",
" return mod\n",
"\n",
" fullname = \"bpe-from-scratch\"\n",
" names = [\"BPETokenizerSimple\"]\n",
"\n",
" return import_definitions_from_notebook(fullname, names)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e",
"metadata": {},
"outputs": [],
"source": [
"imported_module = import_from_notebook()\n",
"BPETokenizerSimple = getattr(imported_module, \"BPETokenizerSimple\", None)\n",
"\n",
"tokenizer_gpt2 = BPETokenizerSimple()\n",
"tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
" vocab_path=os.path.join(\"gpt2_model\", \"encoder.json\"),\n",
" bpe_merges_path=os.path.join(\"gpt2_model\", \"vocab.bpe\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n"
]
}
],
"source": [
"integers = tokenizer_gpt2.encode(text)\n",
"\n",
"print(integers)"
]
},
{
"cell_type": "markdown",
"id": "907a1ade-3401-4f2e-9017-7f58a60cbd98",
@@ -390,7 +413,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 18,
"id": "a61bb445-b151-4a2f-8180-d4004c503754",
"metadata": {},
"outputs": [],
Expand All @@ -399,45 +422,69 @@
" raw_text = f.read()"
]
},
{
"cell_type": "markdown",
"id": "9c0ae9f0-47a1-4e7f-a210-e1d2721f4d1e",
"metadata": {},
"source": [
"### Original OpenAI GPT-2 tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 19,
"id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4.29 ms ± 46.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.44 ms ± 54 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit orig_tokenizer.encode(raw_text)"
]
},
{
"cell_type": "markdown",
"id": "ef2ce3f3-1f81-47ce-b563-99fe2c7a1e90",
"metadata": {},
"source": [
"### Tiktoken OpenAI GPT-2 tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 20,
"id": "036dd628-3591-46c9-a5ce-b20b105a8062",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.4 ms ± 9.71 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
"1.08 ms ± 4.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%timeit tik_tokenizer.encode(raw_text)"
]
},
{
"cell_type": "markdown",
"id": "0c748de8-273e-42df-b078-3a510106da60",
"metadata": {},
"source": [
"### Hugging Face OpenAI GPT-2 tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 21,
"id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
"metadata": {},
"outputs": [
@@ -452,7 +499,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"8.46 ms ± 48.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"10.3 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -462,21 +509,47 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 22,
"id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8.36 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"10.2 ms ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
]
},
{
"cell_type": "markdown",
"id": "91ac2876-f36e-498c-bd75-8597a39f2d4b",
"metadata": {},
"source": [
"### My own GPT-2 tokenizer (for educational purposes)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.74 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%timeit tokenizer_gpt2.encode(raw_text)"
]
}
],
"metadata": {
1,301 changes: 1,301 additions & 0 deletions ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb (large diff not rendered)
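Since the diff of the new notebook is not rendered here, the following is a minimal sketch of how the class it introduces, BPETokenizerSimple, is exercised by the compare-bpe-tiktoken.ipynb changes above. The flat `bpe_from_scratch` import and the sample string are assumptions for illustration only: in the commit, the class is pulled out of ../05_bpe-from-scratch/bpe-from-scratch.ipynb via the import_from_notebook() helper, and the gpt2_model/ files are the ones fetched earlier in that notebook.

import os

# Hypothetical flat import for illustration; the commit instead loads the class
# from bpe-from-scratch.ipynb via the import_from_notebook() helper shown above.
from bpe_from_scratch import BPETokenizerSimple

tokenizer_gpt2 = BPETokenizerSimple()

# Reuse the original OpenAI GPT-2 vocabulary and merges downloaded earlier
# in compare-bpe-tiktoken.ipynb (gpt2_model/encoder.json and gpt2_model/vocab.bpe)
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=os.path.join("gpt2_model", "encoder.json"),
    bpe_merges_path=os.path.join("gpt2_model", "vocab.bpe")
)

sample_text = "Hello, world. Is this-- a test?"  # placeholder input string
token_ids = tokenizer_gpt2.encode(sample_text)
print(token_ids)  # prints a list of integer GPT-2 token IDs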
