Psych_AI.py
# Import dependencies
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.model_selection import KFold, train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
from statistics import mean, mode
import sys
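# Usage (inferred from the argv handling below, not stated in the original):
#   python Psych_AI.py <outcome_name>
# where <outcome_name> is the prefix of a '<outcome_name>_scaled' column in the data.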
# Read in the data
data = pd.read_json('../training_data_8.30.json')
outname = sys.argv[1]
# epochs = sys.argv[2]
# learning = sys.argv[3]
print(outname)
# Apply preprocessing: drop rows with missing values ('.'), standardize the
# outcome, and build a dataframe with the text and a numerical label
data = data[data[outname+'_scaled'] != '.'].copy()
scaler = StandardScaler()
data[outname+'_scaled_new'] = scaler.fit_transform(
    data[outname+'_scaled'].astype(float).to_numpy().reshape(-1, 1)
)
data_text = data['text'].astype(str).tolist()
data_labels = data[outname+'_scaled_new'].astype(float).tolist()
# Form into a DataFrame
train_data = pd.DataFrame({
    'text': data_text,
    'labels': data_labels,
})
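# train_data now holds the two columns simpletransformers expects for a
# regression task: 'text' (str) and 'labels' (float).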
# Split data into folds
kk = 5
kf = KFold(n_splits=kk, shuffle=True, random_state=1234)
count = 0
Results = []
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
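# Each fold below trains a fresh xlm-roberta-large regression model on the
# training split and evaluates it on the held-out split; per-fold r2/mse/mae
# are collected in Results and averaged after the loop.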
for train_index, val_index in kf.split(train_data):
    count += 1
    print("Fold = " + str(count))
    # Split data into training and validation sets for this fold
    training = train_data.iloc[train_index]
    validation = train_data.iloc[val_index]
    # Set up model arguments
    model_args = ClassificationArgs(sliding_window=True)
    model_args.use_early_stopping = True
    model_args.early_stopping_metric = "r2"
    model_args.early_stopping_metric_minimize = False
    model_args.early_stopping_patience = 5
    model_args.num_train_epochs = 30
    model_args.learning_rate = 2e-5
    model_args.evaluate_during_training = True
    model_args.regression = True
    model_args.hidden_dropout_prob = 0.2
    model_args.train_batch_size = 48  # originally 32
    model_args.eval_batch_size = 24  # originally 16
    model_args.evaluate_during_training_silent = True
    model_args.evaluate_during_training_steps = 64
    # model_args.manual_seed = 4
    model_args.max_seq_length = 512
    model_args.no_cache = True
    model_args.no_save = True
    model_args.overwrite_output_dir = True
    model_args.reprocess_input_data = True
    model_args.gradient_accumulation_steps = 12
    model_args.train_custom_parameters_only = False
    model_args.save_best_model = True
    # Added: mixed-precision training and multi-GPU
    model_args.fp16 = True
    model_args.n_gpu = 8  # n_gpu is the simpletransformers argument name; a plain 'gpu' attribute is ignored
    # Create a TransformerModel
    model = ClassificationModel(
        "xlmroberta",
        "xlm-roberta-large",
        num_labels=1,
        args=model_args,
        use_cuda=True,
    )
    # Train the model, tracking r2/mse/mae during training.
    # Note: predictions appear to be printed each time the model is evaluated
    # during training; the print line was commented out upstream, though it is
    # unclear whether that suppresses the output.
    model.train_model(
        training,
        eval_df=validation,
        r2=lambda truth, predictions: r2_score(truth, predictions),
        mse=lambda truth, predictions: mean_squared_error(truth, predictions),
        mae=lambda truth, predictions: mean_absolute_error(truth, predictions),
    )
    # Evaluate the model on the held-out fold
    result, model_outputs, wrong_predictions = model.eval_model(
        validation, r2=r2_score, mse=mean_squared_error, mae=mean_absolute_error
    )
    Results.append([result['r2'], result['mse'], result['mae']])
# Calculate average performance across folds
total_r2 = 0
total_mse = 0
total_mae = 0
for result in Results:
    total_r2 += result[0]
    total_mse += result[1]
    total_mae += result[2]
print("Final result of K-Fold")
print("r2", total_r2/kk)
print("mse", total_mse/kk)
print("mae", total_mae/kk)
# Apply the model to the held-out test set
data = pd.read_json('../test_data_8.30.json')
# Apply the same preprocessing: drop rows with missing values ('.'), standardize,
# and build a dataframe with the text and a numerical label
test_data = data[data[outname+'_scaled'] != '.'].copy()
scaler = StandardScaler()
test_data[outname+'_scaled_new'] = scaler.fit_transform(
    test_data[outname+'_scaled'].astype(float).to_numpy().reshape(-1, 1)
)
test_text = test_data['text'].astype(str).tolist()
test_labels = test_data[outname+'_scaled_new'].astype(float).tolist()
test_data = pd.DataFrame({
    'text': test_text,
    'labels': test_labels,
})
# Evaluate the model on the test set
result, model_outputs, wrong_predictions = model.eval_model(
    test_data, r2=r2_score, mse=mean_squared_error, mae=mean_absolute_error
)
print("results", result)
# Save the model, tokenizer, and config to a directory named after the outcome
model.model.save_pretrained(outname + 'model1')
model.tokenizer.save_pretrained(outname + 'model1')
model.config.save_pretrained(outname + 'model1')
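# A directory saved this way can later be reloaded for inference. A minimal
# sketch (not part of the original script; shown only as a usage example):
#   reloaded = ClassificationModel("xlmroberta", outname + 'model1', num_labels=1, use_cuda=True)
#   predictions, raw_outputs = reloaded.predict(["some example text"])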