In the early training stages, the example generation may produce out-of-bound tokens; make sure tiktoken doesn't see them.
karpathy · Aug 26, 2024 · 301de76 · 301de76
1 parent 6104ab1
commit 301de76
Showing 1 changed file with 5 additions and 0 deletions.
diff --git a/train_gpt2.py b/train_gpt2.py
@@ -473,6 +473,11 @@ def get_lr(it):
                 xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
                 # append to the sequence
                 xgen = torch.cat((xgen, xcol), dim=1)
+
+        # The model may generate token ids that are out of bounds, making tiktoken panic
+        # in decode(..). A quick fix is to replace them with the EOT token.
+        xgen[xgen >= enc.max_token_value + 1] = enc.eot_token
+
         # print the generated text
         for i in range(num_return_sequences):
             tokens = xgen[i, :max_length].tolist()