-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse-pos.js
155 lines (126 loc) · 3.49 KB
/
parse-pos.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
const R = require("ramda");
const fs = require("fs");
const path = require("path");
const { promisify } = require("util");
const raw_data = require("./data/raw.json");
const pos = require("pos");
// // How many tweets?
// // 42,932
// console.log(raw_data.length.toLocaleString());
// // That is a lot!
// console.dir(first5);
// console.log(
// R.filter(
// R.compose(R.not, R.propSatisfies(R.contains("https://t.co/"), "text")),
// raw_data
// ).length.toLocaleString()
// );
// Tweets carrying a t.co link are shares/media posts rather than plain
// authored text — exclude them before building the model.
const has_link = R.propSatisfies(R.contains("https://t.co/"), "text");
const actual_tweets = R.filter(R.complement(has_link), raw_data);
const text = R.map(R.prop("text"), actual_tweets);
// Retweets begin with "RT"; keep only tweets the account authored.
const actual_tweet_text = R.filter(t => !t.startsWith("RT"), text);
// Build a list of n-grams from a list of tokens (letters, words, POS
// tags, ...). Each gram is `n` consecutive tokens joined by `sep`; near
// the end of the list, where fewer than n tokens remain, the missing
// slots are padded with "__EMPTY__" so every input token yields exactly
// one gram.
//
// @param {number} n - gram size, must be >= 1
// @param {Array} list - tokens to gram-ify
// @param {string} [sep=" "] - separator used when joining each gram
// @returns {string[]} one joined gram per input token
// @throws {Error} when n is missing/< 1 or list is missing
const makeNgram = (n, list, sep = " ") => {
  if (!n || n < 1) {
    throw new Error("You must give a number higher than 0");
  }
  if (!list) {
    throw new Error("You must give me a list of things to gram-ify");
  }
  // `map` instead of reduce-with-spread: the original rebuilt the whole
  // accumulator array on every step (accidental O(n^2)); this is linear
  // in the input size. `??` (not `||`) pads only truly-missing slots, so
  // falsy-but-present tokens such as "" or 0 are kept as-is.
  return list.map((token, i) => {
    const window = list.slice(i + 1, i + n);
    const tail = Array.from(
      { length: n - 1 },
      (_, j) => window[j] ?? "__EMPTY__"
    );
    return [token, ...tail].join(sep);
  });
};
// Tag every tweet with its parts of speech, keeping only the tag
// sequence — the model is built over grammar shape, not the words.
const pos_words = R.map(tweet => {
  const lexer = new pos.Lexer();
  const tagger = new pos.Tagger();
  const tagged = tagger.tag(lexer.lex(tweet));
  return tagged.map(pair => pair[1]);
}, actual_tweet_text);
// 1-grams: each tweet becomes its plain list of POS tags.
const listOGrams = R.map(tags => makeNgram(1, tags), pos_words);
// For one tweet's gram list, count how often each gram is immediately
// followed by each other gram; "__EMPTY__" marks the end of the list.
// Returns a nested table: { prefix: { suffix: count } }.
const get_suffix_count = grams =>
  grams.reduce((cache, gram, i) => {
    const next = grams[i + 1] ? grams[i + 1] : "__EMPTY__";
    const suffixes = cache[gram] || {};
    suffixes[next] = (suffixes[next] || 0) + 1;
    cache[gram] = suffixes;
    return cache;
  }, {});
// Case-insensitive frequency table for one tweet's gram list:
// { gram: occurrences }.
const get_frequency_gram = grams => {
  const tally = {};
  for (const raw of grams) {
    const key = raw.toLowerCase();
    tally[key] = (tally[key] || 0) + 1;
  }
  return tally;
};
// Per-tweet tables: suffix transitions and gram frequencies.
const counts = R.map(get_suffix_count, listOGrams);
const frequencies = R.map(get_frequency_gram, listOGrams);
// Merge the per-tweet suffix-count tables into one global table,
// summing the count for every (prefix, suffix) pair.
// NOTE(review): the reducer mutates the curried seed object, so this
// function is safe to invoke only once — which is how it is used below.
const reduceCounts = R.reduce((acc, table) => {
  for (const [prefix, suffixCounts] of Object.entries(table)) {
    const merged = acc[prefix] || (acc[prefix] = {});
    for (const [suffix, n] of Object.entries(suffixCounts)) {
      merged[suffix] = (merged[suffix] || 0) + n;
    }
  }
  return acc;
}, {});
// Merge the per-tweet frequency tables into one global table.
// NOTE(review): mutates the curried seed object — single-use only,
// which matches its one invocation below.
const reduceFreq = R.reduce((acc, table) => {
  for (const [gram, n] of Object.entries(table)) {
    acc[gram] = (acc[gram] || 0) + n;
  }
  return acc;
}, {});
const freq_hash = reduceFreq(frequencies);
const counts_hash = reduceCounts(counts);
// Write both models next to the raw data. The chain ends in .catch so a
// failed write is reported and flags the process, instead of dying as an
// unhandled promise rejection.
const writeFile = promisify(fs.writeFile);
writeFile(
  path.resolve(__dirname, "data", "freq_pos.json"),
  JSON.stringify(freq_hash, null, 2)
)
  .then(() =>
    writeFile(
      path.resolve(__dirname, "data", "count_pos.json"),
      JSON.stringify(counts_hash, null, 2)
    )
  )
  .then(console.log)
  .catch(err => {
    console.error("Failed to write POS data:", err);
    process.exitCode = 1;
  });