# feature_calls.py  (124 lines / 96 loc / 4.52 KB per the original listing)
# NOTE: lines 1-132 of the scraped page were GitHub web-UI residue
# (navigation text plus a copied line-number gutter); replaced with this
# comment so the module parses as Python.
from functions import *
from feature_extraction import *
from ngrams import *
from POS import pos_tag, skipgramming
from scipy import spatial
from sklearn.metrics.pairwise import cosine_distances
import textstat #pip install textstat
import pandas as pd
import time
import readability
from multiprocessing import Pool
import psutil
import pickle
#%%
# --- Data loading and preprocessing (module-level setup) ---
# Produces the globals consumed by calls()/arrays_combined() below:
#   df, text_uniques, corpora, pos, num_pairs
raw_data = read_data()
# df = textimport_light(raw_data, pandas_check = True)
df = textimport(raw_data, pandas_check = True)
# Deduplicate texts; text_IDs maps each original text to its unique index.
text_IDs, text_uniques = remove_duplicates(df)
# Pair consecutive IDs: element k holds the ID tuple for text pair k.
df['text_id'] = pd.Series(zip(text_IDs[0::2], text_IDs[1::2]))
#%%
start = time.time()
# batch_size = 100
# corpora = preprocessing_complete(text_uniques[0:batch_size])
# presumably returns 4 corpus variants (raw / no punctuation / no
# stopwords / neither) — indices used that way in calls(); TODO confirm.
corpora = preprocessing_complete(text_uniques)
# Pre-computed POS tags for the full corpus (4536 texts, by filename).
with open('data/pos_tags_whole_text_4536.pkl', 'rb') as f:
    pos = pickle.load(f)
# num_pairs = int(batch_size/2)
# NOTE(review): the commented-out batch version used batch_size/2 pairs;
# here num_pairs = len(raw_data), i.e. the number of raw records — verify
# this really equals the number of text PAIRS and not the number of texts.
num_pairs = len(raw_data)
end = time.time()
print(f"Execution time was {end-start}s")
#%%
def calls(corpora): #9.5s / 100 texts
    """Extract every stylometric feature for each text in *corpora*.

    Parameters
    ----------
    corpora : sequence of four corpus variants (assumed from the inline
        comments — TODO confirm against preprocessing_complete):
        corpora[0] raw text, corpora[1] punctuation removed,
        corpora[2] stopwords removed, corpora[3] both removed.

    Also reads the module-level globals ``pos`` (pickled POS tags) and
    ``num_pairs``.

    Returns
    -------
    tuple of three lists (one entry per text, features zipped together):
        1. scalar-valued features,
        2. list/vector-valued features (later compared via cosine),
        3. features whose extractor already outputs a pairwise
           similarity/cosine value.
    """
    start = time.time()
    # corpora[0] = no preprocessing
    asl = avg_sentence_length(corpora[0]) #0.5s
    # corpora[1] = punctuation removed
    awl = avg_word_length(corpora[1]) #0.07s
    fwf, fws = function_words(corpora[1]) #0.8s
    hl = hapax_legomena(corpora[1]) #0.5s
    liwc = LIWC(corpora[1]) #1.9s
    fl_kinc, avg_syll = readability_metrics(corpora[1]) #1.45s
    ttr = TTR(corpora[1]) #0.1s
    w_tg = tfidf_word_ngrams(corpora[1], 3,3, num_pairs)
    c_tg = tfidf_char_ngrams(corpora[1], 3,3, num_pairs)
    i_o_c = index_of_coincidence(corpora[1]) #0.06s
    # corpora[2] = stopwords removed
    pr = punctuation_ratio(corpora[2]) #0.3s
    sc = special_characters(corpora[2]) #0.2s
    # corpora[3] = both removed
    iw = intensifier_words(corpora[3]) #0.08s
    ta = time_adverbs(corpora[3]) #0.09s
    dgt = digits(corpora[3])
    #pos_tags as input
    startSkip = time.time()
    st = skipgramming(pos, num_pairs, False)
    endSkip = time.time()
    # FIX: previously printed the same generic message as the total timing
    # below, making the two log lines indistinguishable.
    print(f"Skipgram execution time was {endSkip-startSkip}s")
    adj_adv = adj_adv_ratio(pos)
    si_t = simple_tense(pos)
    end = time.time()
    print(f"Execution time was {end-start}s")
    #returns one list for float output, one for lists, and one where the output is already sim or cos
    return list(zip(*(asl, awl, fws, hl, ttr, pr, sc, dgt, adj_adv, i_o_c, fl_kinc, avg_syll))), list(zip(*(liwc, fwf, iw, ta, si_t))), list(zip(*(w_tg, c_tg, st)))
#%% 26.5s for 100 texts...
def arrays_combined(corpora):
    """Turn per-text features into one per-pair feature matrix.

    For each text pair listed in the module-level ``df['text_id']``:
      * scalar features are compared with the project helper ``dist``,
      * vector features are compared feature-by-feature with ``cosine``,
      * already-pairwise features (``sims``) are appended as-is.

    Returns
    -------
    np.ndarray of shape (num_pairs, n_features) — distance, cosine and
    similarity columns horizontally stacked.
    """
    floats, arrays, sims = calls(corpora)
    # Hoisted: the (id_a, id_b) index array was previously built twice.
    pair_index = np.array(list(df['text_id'][:num_pairs]))
    # Scalar features -> element-wise distance per pair.
    # (Identity comprehensions like [x for x in floats] removed.)
    floats_matrix = np.array(floats, dtype=object)
    tagged_pairs_one = floats_matrix[pair_index]
    distance_matrix = np.stack([dist(first, second)
                                for first, second in tagged_pairs_one])
    # Vector features -> one cosine distance per feature per pair.
    array_matrix = np.array(arrays, dtype=object)
    tagged_pairs_two = array_matrix[pair_index]
    # missing index table!
    cos = []
    for first, second in tagged_pairs_two:
        # zip replaces the old unused-variable enumerate + parallel indexing
        cos.append([
            float(cosine(np.array(f1).reshape(1, -1),
                         np.array(f2).reshape(1, -1)))
            for f1, f2 in zip(first, second)
        ])
    sim_matrix = np.stack(sims)
    feature_matrix = np.hstack((distance_matrix, cos, sim_matrix))
    return feature_matrix
# return distance_matrix, cos, sim_matrix
#%%
# Run the full pipeline over the preprocessed corpora built above.
feature_matrix = arrays_combined(corpora)
# distance_matrix, cos, sim_matrix = arrays_combined(corpora)
# feature_matrix, cos, cos_matrix = arrays_combined(corpora)
#%%
# Persist the pairwise feature matrix for downstream modelling.
with open('feature_matrix.pkl', 'wb') as f:
    pickle.dump(feature_matrix,f)