-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean-data.py
49 lines (36 loc) · 1.46 KB
/
clean-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""Cleans test sets in format 'sent \t labels...', outputting cleaned sentences to stdout
author: laurie
"""
import argparse
import sys
import unicodedata
from sacremoses import MosesPunctNormalizer
from tools.defines import Patterns
from tools.remove_non_printing_char import get_replacer as non_printing_char_replacer
from tools.demojizier import Demojizer
class SentenceClean:
    """Callable text cleaner for a single sentence.

    An instance holds three reusable helpers: an English
    ``MosesPunctNormalizer``, a non-printing-character replacer built with
    ``" "`` as its argument, and a ``Demojizer``. Calling the instance runs
    the sentence through those helpers, NFKC-normalises it and then deletes
    every match of the Twitter-related patterns defined on ``Patterns``.
    """

    def __init__(self):
        # Helpers are created once here and reused for every sentence.
        self.mpn = MosesPunctNormalizer(lang='en')
        # NOTE(review): presumably " " is the replacement text for
        # non-printing characters -- confirm in tools.remove_non_printing_char.
        self.replace_nonprint = non_printing_char_replacer(" ")
        self.demojiser = Demojizer()

    def __call__(self, line):
        """Return the cleaned version of ``line``.

        Steps, in order: Moses punctuation normalisation, non-printing
        character replacement, Unicode NFKC normalisation, the demojiser
        (called with ``""`` as its second argument), then removal of URL,
        hashtag, mention, reserved-word and number pattern matches.
        """
        text = self.replace_nonprint(self.mpn.normalize(line))
        text = self.demojiser(unicodedata.normalize("NFKC", text), "")

        # Remove twitter effects; the patterns are applied in this exact order.
        twitter_patterns = (
            Patterns.URL_PATTERN,
            Patterns.HASHTAG_PATTERN,
            Patterns.MENTION_PATTERN,
            Patterns.RESERVED_WORDS_PATTERN,
            Patterns.NUMBERS_PATTERN,
        )
        for pattern in twitter_patterns:
            text = pattern.sub('', text)
        return text
# Script body: read a tab-separated test file, clean the first column (the
# sentence) of every row and write one cleaned sentence per line to stdout.
parser = argparse.ArgumentParser()
parser.add_argument("in_file", help="test file in tab-separated format 'sent label1 label2'")
args = parser.parse_args()

sent_cleaner = SentenceClean()

# Explicit UTF-8 so decoding does not depend on the platform's locale default.
with open(args.in_file, encoding="utf-8") as f:
    # Iterate the file object directly: rows are streamed instead of being
    # loaded into memory all at once by readlines().
    for line in f:
        # Only the text before the first tab is the sentence; any remaining
        # fields (the labels) are dropped.
        raw_sent = line.split('\t')[0].strip()
        clean_sent = sent_cleaner(raw_sent)
        sys.stdout.write(f"{clean_sent}\n")