ex_ttt_w2v_pre.py
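Test-then-train evaluation of data-stream classifiers on the Fakeddit post-title stream: titles are embedded with pretrained word2vec (word2vec-google-news-300), reduced to 100 dimensions with PCA, and scored with ten metrics per chunk; results are saved to results/scores_w2v_pre_pca100.npy.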
import os

import numpy as np
from math import ceil
from tqdm import tqdm

from gensim import downloader
from skmultiflow.trees import HoeffdingTree
from sklearn.decomposition import PCA
from sklearn.metrics import recall_score, precision_score, balanced_accuracy_score
from strlearn.ensembles import KUE, ROSE, NIE
from strlearn.metrics import (balanced_accuracy_score as bac, recall, precision,
                              specificity, f1_score, geometric_mean_score_1,
                              geometric_mean_score_2)
from utils import CDS

# Silence the HuggingFace tokenizers fork warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Fakeddit post stream: column 0 holds the titles
X = np.load("fakeddit_stream/fakeddit_posts.npy", allow_pickle=True)
bias = np.load("fakeddit_stream/fakeddit_posts_y.npy")

# Which label column to use (0 = the binary 2-way labels)
bias_id = 0

print(X.shape)
print(bias.shape)

# Only titles, without the timestamp
stream = X[:, 0]
# Binary problem: for the 2-way labels (bias_id == 0) swap 0 and 1,
# otherwise use the selected label column as-is
y = np.array([1, 0])[bias[:, bias_id]] if bias_id == 0 else bias[:, bias_id]

chunk_size = 250
# Number of chunks covering the whole stream
n_chunks = ceil(stream.shape[0] / chunk_size)

# One dummy sample per class, used to patch chunks missing a class
classes = np.unique(y)
n_classes = len(classes)
dummies = stream[[np.where(y == label)[0][0] for label in classes]]
metrics = (recall, recall_score, precision, precision_score, specificity,
           f1_score, geometric_mean_score_1, geometric_mean_score_2,
           bac, balanced_accuracy_score)

n_estimators = 10
methods = [
    HoeffdingTree(split_criterion="hellinger"),
    CDS(HoeffdingTree(split_criterion="hellinger"), n_estimators),
    NIE(HoeffdingTree(split_criterion="hellinger"), n_estimators),
    KUE(HoeffdingTree(split_criterion="hellinger"), n_estimators),
    ROSE(HoeffdingTree(split_criterion="hellinger"), n_estimators),
]
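# The methods above pair a single Hoeffding tree baseline with four chunk-based
# stream ensembles (CDS from the local utils module; NIE, KUE and ROSE from
# stream-learn), all built over Hellinger-split Hoeffding trees.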
# Alternative transformer-based embedder (unused here):
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-MiniLM-L6-v2', device="mps").to("mps")

# Pretrained 300-dimensional word2vec vectors from gensim's model hub
vectors = downloader.load('word2vec-google-news-300')

pca = PCA(100, random_state=1410)
# Results tensor: METHODS x CHUNKS x METRICS
scores = np.zeros((len(methods), n_chunks, len(metrics)))
for chunk_id in tqdm(range(n_chunks)):
    chunk_X = stream[chunk_id * chunk_size:(chunk_id + 1) * chunk_size]
    chunk_y = y[chunk_id * chunk_size:(chunk_id + 1) * chunk_size]

    # If a class is absent from the chunk, inject the dummy samples
    # so that every chunk carries all classes
    if len(np.unique(chunk_y)) != n_classes:
        chunk_X[:n_classes] = dummies
        chunk_y[:n_classes] = classes

    # Mean word2vec embedding per title; out-of-vocabulary words
    # contribute zero vectors to the mean
    embeddings = []
    for text in chunk_X:
        words = text.split(" ")
        wordvecs = np.zeros((300, len(words)))
        for idx, word in enumerate(words):
            try:
                wordvecs[:, idx] = vectors[word]
            except KeyError:
                pass
        embeddings.append(np.mean(wordvecs, axis=1))
    embeddings = np.array(embeddings)

    # Test-then-train protocol: the first chunk only fits the models,
    # every later chunk is first evaluated and then used for training
    for method_id, method in enumerate(methods):
        if chunk_id == 0:
            preproc_X = pca.fit_transform(embeddings)
            method.fit(preproc_X, chunk_y)
        else:
            preproc_X = pca.transform(embeddings)
            try:
                pred = method.predict(preproc_X)
                for metric_id, metric in enumerate(metrics):
                    scores[method_id, chunk_id, metric_id] = metric(chunk_y, pred)
                method.partial_fit(preproc_X, chunk_y)
            except Exception:
                # Mark all metrics of the failed chunk as missing
                scores[method_id, chunk_id, :] = np.nan

np.save("results/scores_w2v_pre_pca100", scores)
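
A minimal follow-up sketch (an addition, not part of the original script): it loads the tensor saved above and prints the mean balanced accuracy per method. The method names are assumed to mirror the order of the `methods` list, and metric index 8 corresponds to strlearn's bac in the `metrics` tuple.

import numpy as np

method_names = ["HT", "CDS", "NIE", "KUE", "ROSE"]  # assumed order of `methods`
scores = np.load("results/scores_w2v_pre_pca100.npy")

for name, method_scores in zip(method_names, scores):
    # Skip chunk 0 (used only for fitting) and ignore NaN chunks
    bac_per_chunk = method_scores[1:, 8]
    print(f"{name}: mean BAC = {np.nanmean(bac_per_chunk):.3f}")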