
Commit

Working on training loop and debugging index issue in embeddings
jmaczan committed Jun 22, 2024
1 parent 63b69bb commit 5c499a3
Showing 8 changed files with 83 additions and 13 deletions.
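A note on the "index issue in embeddings" in the commit message: this commit wires a GPT-2 tokenizer (50,257-token vocabulary) into the data pipeline while src/gpt.py still sets default_vocabulary_size = 300, so token IDs can exceed the embedding table and raise an index-out-of-range error. Below is a minimal sketch of that failure mode, not part of the commit, assuming the model embeds token IDs with an nn.Embedding sized by vocabulary_size (the embedding layer itself is not shown in this diff).

import torch
import torch.nn as nn
from transformers import GPT2Tokenizer

# Hypothetical reproduction: an embedding table sized with
# default_vocabulary_size = 300 from src/gpt.py.
embedding = nn.Embedding(num_embeddings=300, embedding_dim=512)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
token_ids = torch.tensor(tokenizer.encode("Hello world"))

print(tokenizer.vocab_size)  # 50257 -- far larger than the 300-entry table
# embedding(token_ids) raises "IndexError: index out of range in self" whenever
# any ID is >= 300; sizing the table with tokenizer.vocab_size avoids this.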
Empty file added src/__init__.py
Empty file.
1 change: 0 additions & 1 deletion src/attention_head.py
@@ -1,6 +1,5 @@
import torch
import torch.nn as nn
-from torch.nn import functional as F


class AttentionHead(nn.Module):
36 changes: 36 additions & 0 deletions src/data_loader.py
@@ -0,0 +1,36 @@
+from transformers import GPT2Tokenizer
+import torch
+from torch.utils.data import DataLoader, Dataset
+from gpt import default_context_window, default_batch_size
+
+
+class TextDataset(Dataset):
+    def __init__(self, text, tokenizer, context_window):
+        self.tokens = tokenizer.encode(text)
+        self.context_window = context_window
+
+    def __len__(self):
+        return len(self.tokens) - self.context_window
+
+    def __getitem__(self, index):
+        x = self.tokens[index : index + self.context_window]
+        y = self.tokens[index + 1 : index + self.context_window + 1]
+        return torch.tensor(x), torch.tensor(y)
+
+
+def get_tokenizer(model="gpt2"):
+    tokenizer = GPT2Tokenizer.from_pretrained(model)
+    tokenizer.pad_token = tokenizer.eos_token
+    return tokenizer
+
+
+def get_data_loader(tokenizer, data_path="data/dataset.txt"):
+
+    with open(data_path, "r") as file:
+        text = file.read()
+
+    dataset = TextDataset(
+        text=text, tokenizer=tokenizer, context_window=default_context_window
+    )
+
+    return DataLoader(dataset=dataset, batch_size=default_batch_size, shuffle=True)
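For reference, each TextDataset item is a context_window-long slice of the token stream paired with the same slice shifted one position to the right, i.e. next-token targets. A quick illustrative check, not part of the commit, using a hypothetical toy tokenizer and assuming it is run from src/ so the import resolves:

from data_loader import TextDataset

class ToyTokenizer:
    # Hypothetical stand-in that "tokenizes" any text into IDs 0..9.
    def encode(self, text):
        return list(range(10))

dataset = TextDataset(text="ignored", tokenizer=ToyTokenizer(), context_window=4)
x, y = dataset[0]
print(x.tolist())    # [0, 1, 2, 3]
print(y.tolist())    # [1, 2, 3, 4] -- inputs shifted by one token
print(len(dataset))  # 6 == len(tokens) - context_window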
5 changes: 3 additions & 2 deletions src/gpt.py
@@ -1,8 +1,8 @@
import torch
import torch.nn as nn

-from src.positional_encoding import PositionalEncoding
-from src.transformer_block import TransformerBlock
+from positional_encoding import PositionalEncoding
+from transformer_block import TransformerBlock

torch.manual_seed(1995)

@@ -11,6 +11,7 @@
default_vocabulary_size = 300
default_attention_heads_count = 8
default_transformer_blocks_count = 4
+default_batch_size = 32


class GPT(nn.Module):
4 changes: 2 additions & 2 deletions src/multi_head_attention.py
@@ -1,7 +1,7 @@
import torch
import torch.nn as nn

-from src.attention_head import AttentionHead
+from attention_head import AttentionHead


class MultiHeadAttention(nn.Module):
@@ -14,7 +14,7 @@ def __init__(self, embeddings_dim, heads_count):
        self.heads = nn.ModuleList(
            [
                AttentionHead(
-                    embeddings_dim=self.single_head_size,
+                    embedding_dim=self.single_head_size,
                )
                for _ in range(self.heads_count)
            ]
47 changes: 41 additions & 6 deletions src/train.py
@@ -1,28 +1,63 @@
import torch
import torch.nn as nn

-from src.gpt import GPT
+from data_loader import get_data_loader, get_tokenizer
+from gpt import (
+    GPT,
+    default_context_window,
+    default_attention_heads_count,
+    default_batch_size,
+    default_embedding_dimension,
+    default_transformer_blocks_count,
+    default_vocabulary_size,
+)


default_num_epochs = 10
default_learning_rate = 0.001


-def train(num_epochs=default_num_epochs, lr=default_learning_rate):
-    model = GPT()
-    criterion = nn.CrossEntropyLoss()
+def train(
+    num_epochs=default_num_epochs,
+    lr=default_learning_rate,
+    vocabulary_size=default_vocabulary_size,
+    embedding_dimension=default_embedding_dimension,
+    context_window=default_context_window,
+    heads_count=default_attention_heads_count,
+    blocks_count=default_transformer_blocks_count,
+):
+    model = GPT(
+        vocabulary_size=vocabulary_size,
+        embedding_dimension=embedding_dimension,
+        context_window=context_window,
+        heads_count=heads_count,
+        blocks_count=blocks_count,
+    )
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
+    tokenizer = get_tokenizer()
+    data_loader = get_data_loader(tokenizer=tokenizer)
+
+    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
-    data_loader = None

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
+
        for inputs, targets in data_loader:
+            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
+            loss = criterion(outputs.view(-1, vocabulary_size), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

-    return None
+        print(
+            f"Epoch {epoch + 1}/{num_epochs}. Total loss: {total_loss}. Loss: {total_loss/len(data_loader)}"
+        )
+
+

if __name__ == "__main__":
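One note on the reshaping in the new loss call above (a sketch, not part of the commit): nn.CrossEntropyLoss expects logits of shape (N, C) and targets of shape (N,), so per-token logits are flattened before the loss. Assuming the model returns logits of shape (batch, context_window, vocabulary_size):

import torch
import torch.nn as nn

batch_size, context_window, vocabulary_size = 32, 16, 300
criterion = nn.CrossEntropyLoss()

logits = torch.randn(batch_size, context_window, vocabulary_size)          # dummy model output
targets = torch.randint(0, vocabulary_size, (batch_size, context_window))  # dummy next-token IDs

# (512, 300) logits against (512,) targets after flattening
loss = criterion(logits.view(-1, vocabulary_size), targets.view(-1))
print(loss.item())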
3 changes: 1 addition & 2 deletions src/transformer_block.py
@@ -1,7 +1,6 @@
-import torch
import torch.nn as nn

-from src.multi_head_attention import MultiHeadAttention
+from multi_head_attention import MultiHeadAttention


class TransformerBlock(nn.Module):
Empty file added test/__init__.py
Empty file.
