-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_translation_words.py
108 lines (92 loc) · 2.88 KB
/
find_translation_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import pandas as pd
from collections import Counter
import re
from en_function_words import FUNCTION_WORDS
import operator
from nltk.corpus import words
from string import punctuation
regex = re.compile('[%s]' % re.escape(punctuation))
common_words = set(words.words())
FUNCTION_WORDS = set(FUNCTION_WORDS)
"""
A script to identify words that would indicate that a post is a translation post
"""
def clean_string(string):
"""
Helper function to clean a string and tokenize it
:param string: str
The str to clean by removing punctuation
:return: list
Cleaned list of tokens to be returned
"""
global regex
string = string.lower()
string = re.sub('&.*;', ' ', string)
string = regex.sub("", string)
string_list = string.split()
return string_list
def count_words(lst):
"""
Get word count from list of posts
:param lst: list or array
:return: Counter
"""
word_counter = Counter()
for post in lst:
string_list = clean_string(post)
final_words = filter_words(string_list)
word_counter.update(final_words)
return word_counter
def compute_probability(word, joined_count, counter2):
"""
Compute conditional probability given
:param joined_count:
:param word:
:param counter2:
:return:
"""
if word in counter2:
pwt = joined_count[word] / len(joined_count)
pw = counter2[word] / len(counter2)
return pwt / pw
else:
pwt = joined_count[word] / len(joined_count)
return pwt
def filter_words(word_list):
"""
Filter out certain words
:param word_list: list
:return: list
"""
final_words = []
for word in word_list:
if word in FUNCTION_WORDS:
continue
elif len(word) < 3:
continue
elif word not in common_words:
continue
else:
final_words.append(word)
return final_words
def main_program(translation_file, cs_post_file):
p_wt = {}
translation_posts = pd.read_csv(translation_file, encoding='utf-8')
cs_posts = pd.read_csv(cs_post_file, encoding="utf-8")
translation_lst = translation_posts["Text"].values
cs_lst = cs_posts["Text"].values
cs_count = count_words(cs_lst)
translation_count = count_words(translation_lst)
for word in translation_count:
p_wt[word] = compute_probability(word, translation_count, cs_count)
sorted_probs = sorted(p_T.items(), reverse=True, key=operator.itemgetter(1))
final_translation_words = sorted_probs[:50]
with open('translation_prob.csv', 'w') as csv_file:
writer = csv.writer(csv_file)
for key, value in final_translation_words:
writer.writerow([key, value])
writer.close()
if __name__ == "__main__":
translation_loc = sys.argv[1]
cs_post_loc = sys.argv[2]
main_program(translation_loc, cs_post_loc)