# bart.py — fine-tune facebook/bart-large to translate code1 -> code2 pairs from a CSV.
# Install the required libraries before running this script.
# NOTE: "!pip install ..." is IPython/notebook-only syntax and is a
# SyntaxError in a plain .py file, so the install commands are kept here
# as shell instructions instead:
#   pip install transformers torch
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments

# Load the dataset; the CSV must provide 'code1' (source) and 'code2'
# (target) columns — see preprocess_data below.
df = pd.read_csv("yol/dosya_adi.csv")  # path to the dataset file

# Show the first rows to sanity-check the load
print(df.head())

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
def preprocess_data(examples):
    """Tokenize source/target code pairs for seq2seq fine-tuning.

    Args:
        examples: mapping with 'code1' (source strings) and 'code2'
            (target strings) lists of equal length.

    Returns:
        A tokenizer encoding dict with 'input_ids', 'attention_mask' and
        'labels' (the tokenized targets), each padded/truncated to 512.
    """
    inputs = list(examples['code1'])
    targets = list(examples['code2'])
    # Tokenize sources and targets in a single call. `text_target=`
    # replaces the deprecated `tokenizer.as_target_tokenizer()` context
    # manager (removed path in recent transformers) and fills in the
    # 'labels' key directly — same output as the original two-step code.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    return model_inputs
# Convert the DataFrame to a column-wise dict and tokenize it in one pass.
dataset = preprocess_data(df.to_dict("list"))
# PyTorch Dataset sınıfını tanımla
class CodeDataset(Dataset):
def __init__(self, encodings):
self.encodings = encodings
def __getitem__(self, idx):
return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
def __len__(self):
return len(self.encodings['input_ids'])
# Wrap the tokenized data in the Dataset the Trainer will iterate over.
train_dataset = CodeDataset(dataset)

# Load the pretrained model and configure fine-tuning
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

training_args = TrainingArguments(
    output_dir='./results',          # checkpoints and training outputs
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,    # unused unless an eval set is passed below
    warmup_steps=500,                # linear learning-rate warmup
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,  # add here if an evaluation set is available
)

# Start training
trainer.train()

# Save the fine-tuned model and its tokenizer to the same directory
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")