Implementing the BPE Tokenizer from Scratch (#487)
rasbt authored Jan 17, 2025
1 parent 2fef211 commit 0d4967e
Showing 4 changed files with 1,463 additions and 86 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -102,6 +102,7 @@ Several folders contain optional materials as a bonus for interested readers:
- [Installing Python Packages and Libraries Used In This Book](setup/02_installing-python-libraries)
- [Docker Environment Setup Guide](setup/03_optional-docker-environment)
- **Chapter 2: Working with text data**
- [Byte Pair Encoding (BPE) Tokenizer From Scratch](ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb)
- [Comparing Various Byte Pair Encoding (BPE) Implementations](ch02/02_bonus_bytepair-encoder)
- [Understanding the Difference Between Embedding Layers and Linear Layers](ch02/03_bonus_embedding-vs-matmul)
- [Dataloader Intuition with Simple Numbers](ch02/04_bonus_dataloader-intuition)
4 changes: 3 additions & 1 deletion ch02/01_main-chapter-code/ch02.ipynb
@@ -1900,7 +1900,9 @@
"source": [
"See the [./dataloader.ipynb](./dataloader.ipynb) code notebook, which is a concise version of the data loader that we implemented in this chapter and will need for training the GPT model in upcoming chapters.\n",
"\n",
"See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions."
"See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions.\n",
"\n",
"See the [Byte Pair Encoding (BPE) Tokenizer From Scratch](../02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb) notebook if you are interested in learning how the GPT-2 tokenizer can be implemented and trained from scratch."
]
}
],
243 changes: 158 additions & 85 deletions ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
@@ -67,7 +67,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"tiktoken version: 0.5.1\n"
"tiktoken version: 0.7.0\n"
]
}
],
@@ -180,8 +180,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching encoder.json: 1.04Mit [00:00, 3.14Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 1.67Mit/s] \n"
"Fetching encoder.json: 1.04Mit [00:00, 3.47Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 2.07Mit/s] \n"
]
}
],
@@ -259,7 +259,7 @@
{
"data": {
"text/plain": [
"'4.34.0'"
"'4.48.0'"
]
},
"execution_count": 12,
@@ -278,78 +278,7 @@
"execution_count": 13,
"id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e4df871bb797435787143a3abe6b0231",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f11b27a4aabf43af9bf57f929683def6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d3aa9a24aacc43108ef2ed72e7bacd33",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f9341bc23b594bb68dcf8954bff6d9bd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c5f55f2f1dbc4152acc9b2061167ee0a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading config.json: 0%| | 0.00/665 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from transformers import GPT2Tokenizer\n",
"\n",
@@ -377,6 +306,100 @@
"hf_tokenizer(strings)[\"input_ids\"]"
]
},
{
"cell_type": "markdown",
"id": "9d0f2e95-8ae8-4606-a8e0-b0fce91cfac9",
"metadata": {},
"source": [
"<br>\n",
"&nbsp;\n",
"\n",
"## Using my own from-scratch BPE tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import io\n",
"import nbformat\n",
"import types\n",
"\n",
"def import_from_notebook():\n",
" def import_definitions_from_notebook(fullname, names):\n",
" current_dir = os.getcwd()\n",
" path = os.path.join(current_dir, \"..\", \"05_bpe-from-scratch\", fullname + \".ipynb\")\n",
" path = os.path.normpath(path)\n",
"\n",
" # Load the notebook\n",
" if not os.path.exists(path):\n",
" raise FileNotFoundError(f\"Notebook file not found at: {path}\")\n",
"\n",
" with io.open(path, \"r\", encoding=\"utf-8\") as f:\n",
" nb = nbformat.read(f, as_version=4)\n",
"\n",
" # Create a module to store the imported functions and classes\n",
" mod = types.ModuleType(fullname)\n",
" sys.modules[fullname] = mod\n",
"\n",
" # Go through the notebook cells and only execute function or class definitions\n",
" for cell in nb.cells:\n",
" if cell.cell_type == \"code\":\n",
" cell_code = cell.source\n",
" for name in names:\n",
" # Check for function or class definitions\n",
" if f\"def {name}\" in cell_code or f\"class {name}\" in cell_code:\n",
" exec(cell_code, mod.__dict__)\n",
" return mod\n",
"\n",
" fullname = \"bpe-from-scratch\"\n",
" names = [\"BPETokenizerSimple\"]\n",
"\n",
" return import_definitions_from_notebook(fullname, names)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e",
"metadata": {},
"outputs": [],
"source": [
"imported_module = import_from_notebook()\n",
"BPETokenizerSimple = getattr(imported_module, \"BPETokenizerSimple\", None)\n",
"\n",
"tokenizer_gpt2 = BPETokenizerSimple()\n",
"tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
" vocab_path=os.path.join(\"gpt2_model\", \"encoder.json\"),\n",
" bpe_merges_path=os.path.join(\"gpt2_model\", \"vocab.bpe\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n"
]
}
],
"source": [
"integers = tokenizer_gpt2.encode(text)\n",
"\n",
"print(integers)"
]
},
{
"cell_type": "markdown",
"id": "907a1ade-3401-4f2e-9017-7f58a60cbd98",
@@ -390,7 +413,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 18,
"id": "a61bb445-b151-4a2f-8180-d4004c503754",
"metadata": {},
"outputs": [],
Expand All @@ -399,45 +422,69 @@
" raw_text = f.read()"
]
},
{
"cell_type": "markdown",
"id": "9c0ae9f0-47a1-4e7f-a210-e1d2721f4d1e",
"metadata": {},
"source": [
"### Original OpenAI GPT-2 tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 19,
"id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4.29 ms ± 46.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.44 ms ± 54 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit orig_tokenizer.encode(raw_text)"
]
},
{
"cell_type": "markdown",
"id": "ef2ce3f3-1f81-47ce-b563-99fe2c7a1e90",
"metadata": {},
"source": [
"### Tiktoken OpenAI GPT-2 tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 20,
"id": "036dd628-3591-46c9-a5ce-b20b105a8062",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.4 ms ± 9.71 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
"1.08 ms ± 4.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%timeit tik_tokenizer.encode(raw_text)"
]
},
{
"cell_type": "markdown",
"id": "0c748de8-273e-42df-b078-3a510106da60",
"metadata": {},
"source": [
"### Hugging Face OpenAI GPT-2 tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 21,
"id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
"metadata": {},
"outputs": [
@@ -452,7 +499,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"8.46 ms ± 48.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"10.3 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -462,21 +509,47 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 22,
"id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8.36 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"10.2 ms ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
]
},
{
"cell_type": "markdown",
"id": "91ac2876-f36e-498c-bd75-8597a39f2d4b",
"metadata": {},
"source": [
"### My own GPT-2 tokenizer (for educational purposes)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.74 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%timeit tokenizer_gpt2.encode(raw_text)"
]
}
],
"metadata": {
1,301 changes: 1,301 additions & 0 deletions ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb (large diff not rendered)
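Since the diff of the new notebook is not rendered here, the following is a minimal sketch of how the class it introduces, BPETokenizerSimple, is exercised by the compare-bpe-tiktoken.ipynb changes above. The flat `bpe_from_scratch` import and the sample string are assumptions for illustration only: in the commit, the class is pulled out of ../05_bpe-from-scratch/bpe-from-scratch.ipynb via the import_from_notebook() helper, and the gpt2_model/ files are the ones fetched earlier in that notebook.

import os

# Hypothetical flat import for illustration; the commit instead loads the class
# from bpe-from-scratch.ipynb via the import_from_notebook() helper shown above.
from bpe_from_scratch import BPETokenizerSimple

tokenizer_gpt2 = BPETokenizerSimple()

# Reuse the original OpenAI GPT-2 vocabulary and merges downloaded earlier
# in compare-bpe-tiktoken.ipynb (gpt2_model/encoder.json and gpt2_model/vocab.bpe)
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=os.path.join("gpt2_model", "encoder.json"),
    bpe_merges_path=os.path.join("gpt2_model", "vocab.bpe")
)

sample_text = "Hello, world. Is this-- a test?"  # placeholder input string
token_ids = tokenizer_gpt2.encode(sample_text)
print(token_ids)  # prints a list of integer GPT-2 token IDs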
