From a2b2756b26e6fa8bc391ead59316c2b9cfda2630 Mon Sep 17 00:00:00 2001
From: RiccardoBruzzese <43668563+Reyzenello@users.noreply.github.com>
Date: Wed, 7 Aug 2024 09:29:17 +0200
Subject: [PATCH] Updating hyper-parameters in mlp_pytorch.py

I have been playing around with the hyper-parameters. The current
configuration (mlp_pytorch_kaparthy) gives:

Validation Loss: 2.059125
Test Loss: 2.057981416583061

Meanwhile, mlp_pytorch_reyzenello gives:

Validation Loss: 2.053818
Test Loss: 2.0344724378415515

Maybe this could be a better configuration? What do you think?

I ran a random search over 100 configurations to find hyper-parameters
that suit this model; a rough sketch of the search loop is included
after the diff. If I have made any miscalculations or missed anything,
feel free to add a comment. I'm open to feedback, and if only minor
changes are needed I will update the pull request!

Here is the Google Colab I used to search for the best configuration
(work in progress):
https://colab.research.google.com/drive/1R0FF5qlB900CrxuxV_XW89Ds-Pup_S8e#scrollTo=PDzMw_o4IKyo
---
 mlp_pytorch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlp_pytorch.py b/mlp_pytorch.py
index f5ceb21..574a0e4 100644
--- a/mlp_pytorch.py
+++ b/mlp_pytorch.py
@@ -190,16 +190,16 @@ def sample_discrete(probs, coinf):
 train_tokens = [char_to_token[c] for c in open('data/train.txt', 'r').read()]
 
 # create the model
-context_length = 3 # if 3 tokens predict the 4th, this is a 4-gram model
-embedding_size = 48
-hidden_size = 512
+context_length = 4 # if 4 tokens predict the 5th, this is a 5-gram model
+embedding_size = 64
+hidden_size = 1024
 init_rng = RNG(1337)
 # these two classes both produce the exact same results. One uses nn.Module the other doesn't.
 model = MLPRaw(vocab_size, context_length, embedding_size, hidden_size, init_rng)
 # model = MLP(vocab_size, context_length, embedding_size, hidden_size, init_rng)
 
 # create the optimizer
-learning_rate = 7e-4
+learning_rate = 0.01
 optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
 
 # training loop
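
For reference, here is a minimal sketch of the kind of random search described
above. The search space values are illustrative, not the exact grids from the
Colab, and train_and_eval is a hypothetical helper standing in for a wrapper
around the training loop in mlp_pytorch.py that returns the final validation
loss.

import random

# illustrative search space -- the actual ranges used in the Colab may differ
search_space = {
    "context_length": [3, 4, 5],
    "embedding_size": [32, 48, 64, 96],
    "hidden_size": [256, 512, 1024],
    "learning_rate": [7e-4, 1e-3, 3e-3, 1e-2],
}

def train_and_eval(config):
    # hypothetical helper: run the training loop from mlp_pytorch.py with the
    # given hyper-parameters and return the final validation loss
    raise NotImplementedError

rng = random.Random(1337)
best_config, best_val_loss = None, float("inf")
for trial in range(100):  # 100 random configurations, as described above
    # draw one value per hyper-parameter, independently and uniformly
    config = {name: rng.choice(values) for name, values in search_space.items()}
    val_loss = train_and_eval(config)
    if val_loss < best_val_loss:
        best_config, best_val_loss = config, val_loss
        print(f"trial {trial}: new best validation loss {val_loss:.6f} with {config}")

print("best configuration:", best_config, "validation loss:", best_val_loss)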