diff --git a/mlp_pytorch.py b/mlp_pytorch.py
index f5ceb21..574a0e4 100644
--- a/mlp_pytorch.py
+++ b/mlp_pytorch.py
@@ -190,16 +190,16 @@ def sample_discrete(probs, coinf):
 train_tokens = [char_to_token[c] for c in open('data/train.txt', 'r').read()]
 
 # create the model
-context_length = 3 # if 3 tokens predict the 4th, this is a 4-gram model
-embedding_size = 48
-hidden_size = 512
+context_length = 4 # if 4 tokens predict the 5th, this is a 5-gram model
+embedding_size = 64
+hidden_size = 1024
 init_rng = RNG(1337)
 # these two classes both produce the exact same results. One uses nn.Module the other doesn't.
 model = MLPRaw(vocab_size, context_length, embedding_size, hidden_size, init_rng)
 # model = MLP(vocab_size, context_length, embedding_size, hidden_size, init_rng)
 
 # create the optimizer
-learning_rate = 7e-4
+learning_rate = 0.01
 optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
 
 # training loop
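For a rough sense of how much larger the new configuration is, here is a minimal sketch (not part of the diff) of the parameter count implied by these hyperparameters. It assumes the usual Bengio-style character MLP, i.e. an embedding table of shape `(vocab_size, embedding_size)`, a hidden `Linear(context_length * embedding_size, hidden_size)` with tanh, and an output `Linear(hidden_size, vocab_size)`; the helper name `mlp_param_count` and the value `vocab_size = 27` are assumptions for illustration, not taken from the diff.

```python
# Sketch of the parameter count for an MLP with the shapes described above.
# Assumptions (not stated in the diff): embedding table -> flattened context
# -> Linear + tanh -> Linear to vocab_size; vocab_size = 27 is a guess.
def mlp_param_count(vocab_size, context_length, embedding_size, hidden_size):
    emb = vocab_size * embedding_size                                      # embedding table
    fc1 = (context_length * embedding_size) * hidden_size + hidden_size    # hidden layer W + b
    fc2 = hidden_size * vocab_size + vocab_size                            # output layer W + b
    return emb + fc1 + fc2

vocab_size = 27  # assumed vocabulary size
old = mlp_param_count(vocab_size, 3, 48, 512)    # hyperparameters before this diff
new = mlp_param_count(vocab_size, 4, 64, 1024)   # hyperparameters after this diff
print(f"old: {old:,} params, new: {new:,} params ({new / old:.1f}x)")
```

Under these assumptions the new settings roughly triple the parameter count, and the learning rate also jumps from 7e-4 to 1e-2 (about 14x), so it is worth watching the training loss for instability with the larger step size.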