From a2b2756b26e6fa8bc391ead59316c2b9cfda2630 Mon Sep 17 00:00:00 2001
From: RiccardoBruzzese <43668563+Reyzenello@users.noreply.github.com>
Date: Wed, 7 Aug 2024 09:29:17 +0200
Subject: [PATCH] Updating hyper-parameters in mlp_pytorch.py

I have been playing around with the hyper-parameters. The current
configuration (mlp_pytorch_kaparthy) gives:

Validation Loss: 2.059125
Test Loss: 2.057981416583061

Meanwhile, mlp_pytorch_reyzenello gives:

Validation Loss: 2.053818
Test Loss: 2.0344724378415515

Maybe this could be a better configuration? What do you think?

I ran a random search over 100 configurations to find hyper-parameters
that suit this model; a rough sketch of the search loop is included
after the diff. If I have made any miscalculations or missed anything,
feel free to add a comment. I'm open to feedback, and if only minor
changes are needed I will update the pull request!

Here is the Google Colab I used to search for the best configuration
(work in progress):
https://colab.research.google.com/drive/1R0FF5qlB900CrxuxV_XW89Ds-Pup_S8e#scrollTo=PDzMw_o4IKyo
---
 mlp_pytorch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlp_pytorch.py b/mlp_pytorch.py
index f5ceb21..574a0e4 100644
--- a/mlp_pytorch.py
+++ b/mlp_pytorch.py
@@ -190,16 +190,16 @@ def sample_discrete(probs, coinf):
 train_tokens = [char_to_token[c] for c in open('data/train.txt', 'r').read()]
 
 # create the model
-context_length = 3 # if 3 tokens predict the 4th, this is a 4-gram model
-embedding_size = 48
-hidden_size = 512
+context_length = 4 # if 4 tokens predict the 5th, this is a 5-gram model
+embedding_size = 64
+hidden_size = 1024
 init_rng = RNG(1337)
 # these two classes both produce the exact same results. One uses nn.Module the other doesn't.
 model = MLPRaw(vocab_size, context_length, embedding_size, hidden_size, init_rng)
 # model = MLP(vocab_size, context_length, embedding_size, hidden_size, init_rng)
 
 # create the optimizer
-learning_rate = 7e-4
+learning_rate = 0.01
 optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
 
 # training loop
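
For reference, here is a minimal sketch of the kind of random search described
above. The search space values are illustrative, not the exact grids from the
Colab, and train_and_eval is a hypothetical helper standing in for a wrapper
around the training loop in mlp_pytorch.py that returns the final validation
loss.

import random

# illustrative search space -- the actual ranges used in the Colab may differ
search_space = {
    "context_length": [3, 4, 5],
    "embedding_size": [32, 48, 64, 96],
    "hidden_size": [256, 512, 1024],
    "learning_rate": [7e-4, 1e-3, 3e-3, 1e-2],
}

def train_and_eval(config):
    # hypothetical helper: run the training loop from mlp_pytorch.py with the
    # given hyper-parameters and return the final validation loss
    raise NotImplementedError

rng = random.Random(1337)
best_config, best_val_loss = None, float("inf")
for trial in range(100):  # 100 random configurations, as described above
    # draw one value per hyper-parameter, independently and uniformly
    config = {name: rng.choice(values) for name, values in search_space.items()}
    val_loss = train_and_eval(config)
    if val_loss < best_val_loss:
        best_config, best_val_loss = config, val_loss
        print(f"trial {trial}: new best validation loss {val_loss:.6f} with {config}")

print("best configuration:", best_config, "validation loss:", best_val_loss)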