main.py
import os
from transformers import (
AutoTokenizer,
DistilBertForSequenceClassification,
TrainingArguments,
Trainer,
)
from datasets import Dataset
import pandas as pd
import torch

MODEL_DIR = "./saved_model"  # Directory where the model will be saved
BASE_MODEL = "distilbert-base-uncased"
CSV_DATA = "all-data.csv"  # From Kaggle
OUTPUT_DIR = "./results"
LOGGING_DIR = "./logs"


def main():
    if not os.path.isfile(CSV_DATA):
        print("Training data not found, exiting...")
        return

    # Select the compute device: Apple-silicon MPS if available, otherwise CPU
    device = (
        torch.device("mps")
        if torch.backends.mps.is_available()
        else torch.device("cpu")
    )

    # Load the CSV data into a pandas DataFrame
    data = pd.read_csv(
        CSV_DATA, delimiter=",", encoding="latin-1", names=["sentiment", "title"]
    )
    print(data.columns)

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    # Encode the sentiment column as categorical labels
    categorical_sentiment = pd.Categorical(data["sentiment"])
    # Get the categories (unique values in the sentiment column)
    categories = categorical_sentiment.categories
    print("Categories (Labels):", categories)
    # Map each category to its corresponding integer code
    mapping = dict(zip(categories, range(len(categories))))
    print("Category-to-Code Mapping:", mapping)
    labels = categorical_sentiment.codes  # Convert sentiment to integer codes
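    # Note: pd.Categorical orders string categories alphabetically, so with the
    # usual financial-sentiment labels (an assumption about the Kaggle CSV) the
    # mapping would come out as {"negative": 0, "neutral": 1, "positive": 2}.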

    # Define a preprocessing function
    def preprocess_function(examples):
        return tokenizer(
            examples,  # Apply the tokenizer to the title (the text data)
            truncation=True,
            padding="max_length",
            max_length=128,
        )

    # Apply the tokenizer to the text column
    tokenized_data = preprocess_function(data["title"].tolist())
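    # tokenized_data is a BatchEncoding with parallel lists, "input_ids" and
    # "attention_mask", one fixed-length (128-token) row per headline; the
    # DistilBERT tokenizer does not produce token_type_ids.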

    # Convert to Hugging Face Dataset format and add the labels
    dataset = Dataset.from_dict(
        {
            "input_ids": tokenized_data["input_ids"],
            "attention_mask": tokenized_data["attention_mask"],
            "labels": labels,
        }
    )

    # Perform the train-test split: 80% train, 20% test
    split = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]

    # Check if a previously fine-tuned model already exists
    if os.path.exists(MODEL_DIR):
        print(f"Loading model from {MODEL_DIR}...")
        model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    else:
        print(f"Training a new model and saving it to {MODEL_DIR}...")
        # Load the base DistilBERT model for sequence classification with the
        # appropriate number of labels
        model = DistilBertForSequenceClassification.from_pretrained(
            BASE_MODEL, num_labels=len(set(labels))
        )

    # Ensure the model is moved to the correct device
    model.to(device)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,  # Effective batch size: 8 * 4 = 32
        num_train_epochs=20,
        logging_dir=LOGGING_DIR,
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        # Deprecated in newer transformers releases, which select the MPS device
        # automatically when it is available
        use_mps_device=True,
    )
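
    # Optional sketch (not part of the original script): an accuracy metric the
    # Trainer can report at each evaluation if passed as
    # compute_metrics=compute_metrics below. It assumes only the standard
    # EvalPrediction interface: a (predictions, label_ids) pair of NumPy arrays.
    def compute_metrics(eval_pred):
        import numpy as np  # Local import keeps the sketch self-contained

        logits, label_ids = eval_pred
        predicted_classes = np.argmax(logits, axis=-1)
        return {"accuracy": float((predicted_classes == label_ids).mean())}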

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,  # Pass the training dataset
        eval_dataset=eval_dataset,  # Pass the evaluation dataset
    )
    trainer.train()

    # Save the fine-tuned model and tokenizer so later runs can load them from MODEL_DIR
    trainer.save_model(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)

    # Evaluate the model on the held-out split
    print("Evaluating the model...")
    result = trainer.evaluate()
    print(result)

    # Example sentence for inference
    example_sentence = "OpenAI Just Gave Spectacular News to Nvidia Stock Investors"

    # Tokenize the example sentence
    inputs = tokenizer(
        example_sentence,
        return_tensors="pt",  # Return PyTorch tensors
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Move the input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference (ensure no gradients are calculated)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract logits and take the highest-scoring class
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Map the predicted integer code back to its sentiment label
    print(f"Prediction for the sentence: {categories[predictions.item()]}")


if __name__ == "__main__":
    main()
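

# Usage note (assumes the Kaggle CSV has been downloaded to ./all-data.csv):
#     python main.py
# Checkpoints and logs are written to ./results and ./logs; the fine-tuned model
# is saved to ./saved_model and used as the starting point on later runs.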