This repository has been archived by the owner on Aug 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 45
/
social_data_preprocess.py
99 lines (81 loc) · 3.08 KB
/
social_data_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import sys
import emoji
import regex as re
from nlp_utils import convert_window1252_to_utf8
# Lines of the raw input that start with this marker end the current comment.
BREAK_LINE = "---END_COMMENT---"

# Teencode lookup table, loaded once at import time from a tab-separated
# resource file with one "<key>\t<replacement>" pair per line.
# Lines that do not split into exactly two fields are ignored; when a key is
# repeated a warning is printed and the later entry wins.
teencode_dict = {}
# NOTE(review): the resource is assumed to be UTF-8 encoded - confirm.
with open('resources/teencode', encoding='utf-8') as teencode_file:
    for line in teencode_file.read().splitlines():
        parts = line.split('\t')
        if len(parts) != 2:
            continue
        key, replacement = parts
        if key in teencode_dict:
            print('WARN: Found duplicate key {}'.format(key))
        teencode_dict[key] = replacement
def preprocess_sentence(sentence):
    """Clean up a single sentence before it is written out.

    The steps run in this order:

    1. pass the text through ``convert_window1252_to_utf8``;
    2. delete every ``(...)`` group together with its content;
    3. apply ``replace_teencode``;
    4. strip punctuation runs from the end, then from the start;
    5. collapse whitespace runs to one space and trim both ends.

    Args:
        sentence: the sentence text to clean.

    Returns:
        The cleaned sentence (possibly an empty string).
    """
    cleaned = convert_window1252_to_utf8(sentence)
    # drop parenthesised asides, e.g. "abc (xyz) def" -> "abc  def"
    cleaned = re.sub(r'\([^\)]*\)', '', cleaned)
    cleaned = replace_teencode(cleaned)
    # trailing punctuation is removed first, then leading punctuation
    without_tail = re.sub(r'\p{P}+$', '', cleaned)
    trimmed = re.sub(r'^\p{P}+', '', without_tail)
    return re.sub(r'\s+', ' ', trimmed).strip()
def replace_teencode(sentence):
    """Replace teencode tokens using the module-level ``teencode_dict``.

    The sentence is split on whitespace. In each token the leading and
    trailing punctuation runs are fenced off with a ``###`` sentinel so that
    only the core between them is looked up; the token is then glued back
    together, which keeps the punctuation where it was. Tokens are re-joined
    with single spaces.

    Args:
        sentence: text whose tokens should be looked up.

    Returns:
        The sentence with every known teencode core replaced.
    """
    rebuilt = []
    for token in sentence.split():
        # "abc." -> "abc###." -> "###abc###." -> ['', 'abc', '.']
        tail_marked = re.sub(r'(\p{P}*)$', r'###\1', token)
        fully_marked = re.sub(r'^(\p{P}*)', r'\1###', tail_marked)
        pieces = re.split('#{3}', fully_marked)
        # NOTE(review): '#' is itself matched by \p{P}, so tokens that already
        # contain '#' may be fenced differently from the example above.
        core = pieces[1]
        if core in teencode_dict:
            pieces[1] = teencode_dict.get(core)
        rebuilt.append(''.join(pieces))
    return ' '.join(rebuilt)
def replace_special_char(text):
    """Normalise a handful of special characters in ``text``.

    * ``ð`` is replaced by ``đ``.
    * ``&`` (with at most one space on either side) becomes `` và ``;
      ampersands inside an ``http://`` / ``https://`` token are left alone so
      that URLs stay in one piece.
    * The characters ``“ ” " ' ( ) [ ] { }`` are deleted.
    * The literal text ``<U+200B>`` and the real zero-width space character
      (U+200B) are each replaced by a normal space.

    Args:
        text: the raw text.

    Returns:
        The normalised text.
    """
    text = text.replace('ð', 'đ')
    # This has to be a regex substitution: str.replace() would search for the
    # literal text " ?& ?" and therefore never touch a real ampersand.
    # The first alternative swallows whole URLs and puts them back unchanged.
    text = re.sub(
        r'(https?://[^ ]+)| ?& ?',
        lambda match: match.group(1) or ' và ',
        text,
    )
    text = re.sub(r'[“”"\'\)\(\[\]\{\}]', '', text)
    text = re.sub(r'<U\+200B>', ' ', text)
    # Zero-width space, spelled as an escape so the pattern is visible and
    # cannot be lost (an empty pattern would match between every character).
    text = text.replace('\u200b', ' ')
    return text
def preprocess_text(text):
    """Run the text-level clean-up that precedes sentence splitting.

    In order: ``replace_special_char``, collapsing of newline runs to a
    single newline, ``remove_emoij``, then a fixed list of regex
    substitutions (URLs -> ``<url>``, e-mail addresses -> ``<email>``, and
    removal of ellipses written as ``...`` after a word or as ``…``).

    Args:
        text: the raw text of one comment.

    Returns:
        The cleaned text.
    """
    cleaned = replace_special_char(text)
    cleaned = re.sub('\n+', '\n', cleaned)
    cleaned = remove_emoij(cleaned)

    substitutions = (
        # url, email
        (r'https?://[^ ]+', '<url>'),
        (r'[a-zA-Z][\w.]+@[A-Za-z0-9]+\.[^ ]+', '<email>'),
        # "word... " -> "word ", and the single-character ellipsis
        (r'(\p{L}+)\.{3} ?', r'\1 '),
        (r'… ?', r' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def remove_emoij(text):
    """Drop every whitespace-delimited token that contains an emoji.

    A token is removed as a whole, not just the emoji inside it. Within each
    line the surviving tokens are re-joined with single spaces. Line breaks
    are preserved, because the sentence splitter in this module also splits
    on newlines.

    (The misspelled function name is kept: ``preprocess_text`` calls it.)

    Args:
        text: the text to filter.

    Returns:
        The text without emoji-bearing tokens.
    """
    # Locate the emoji table across versions of the ``emoji`` package:
    # 2.x exposes EMOJI_DATA; 1.x nests UNICODE_EMOJI by language code;
    # 0.x keeps UNICODE_EMOJI as a flat mapping keyed by emoji.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        emoji_table = emoji.UNICODE_EMOJI
        emoji_table = emoji_table.get('en', emoji_table)

    # A set makes the per-token check independent of how many emoji occur.
    emoji_chars = {char for char in text if char in emoji_table}

    cleaned_lines = []
    for line in text.split('\n'):
        kept = [word for word in line.split() if emoji_chars.isdisjoint(word)]
        cleaned_lines.append(' '.join(kept))
    return '\n'.join(cleaned_lines)
def to_sentences(text):
    """Clean ``text`` with ``preprocess_text`` and split it into sentences.

    A split point is placed after every newline and after every ``.``, ``!``,
    ``?`` or ``;`` that is followed by a space. Pieces are stripped of
    surrounding whitespace and empty pieces are discarded.

    Args:
        text: the raw text of one comment.

    Returns:
        A list of non-empty sentence strings.
    """
    cleaned = preprocess_text(text)
    # tag each boundary with a marker, then cut on the marker
    marked = re.sub(r'(\n|[\.!?;] )', r'\1 <newline>', cleaned)
    sentences = []
    for piece in re.split(r' <newline>', marked):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences
def _write_comment_sentences(text, fp):
    """Split one raw comment into sentences and write the usable ones to ``fp``."""
    for sentence in to_sentences(text):
        sentence = preprocess_sentence(sentence)
        # fragments of 10 characters or fewer are dropped
        if len(sentence) > 10:
            fp.write(sentence + '\n')


def main():
    """Convert a raw comment dump into one cleaned sentence per line.

    The input path is taken from ``sys.argv[1]``; the output is written next
    to it as ``<input>.out``. Input lines are accumulated until a line
    starting with ``BREAK_LINE`` is seen, at which point the accumulated
    comment is split into sentences and written out. Text left over after
    the last marker is processed as a final comment instead of being lost.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: {} INPUT_FILE'.format(sys.argv[0]))

    inp = sys.argv[1]
    out = inp + ".out"
    print('INP: {}\nOUT: {}'.format(inp, out))

    pending = []
    comment_count = 0
    # Both files are closed by the ``with`` block, even on error.
    # NOTE(review): UTF-8 is assumed for the corpus - confirm.
    with open(inp, encoding='utf-8') as fin, \
            open(out, 'w', encoding='utf-8') as fp:
        for line in fin:
            if not line.startswith(BREAK_LINE):
                pending.append(line)
                continue
            comment_count += 1
            if comment_count % 1000 == 0:
                print('INFO Current line is {}'.format(comment_count))
            _write_comment_sentences(''.join(pending), fp)
            pending = []
        # input that does not end with the marker still has one comment left
        if pending:
            _write_comment_sentences(''.join(pending), fp)
    print('Done.')


if __name__ == '__main__':
    main()