-
Notifications
You must be signed in to change notification settings - Fork 0
/
corpus_preprocessor.py
40 lines (31 loc) · 1.46 KB
/
corpus_preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from pandas import DataFrame
from data.corpus_handler import CorpusName
from data.file_handler import add_and_get_abs_path, save_df
from data.file_name_generator import gen_sentences_file_name, \
gen_tagged_tokens_file_name
# Sense marker string. It is not referenced in the visible part of this module;
# presumably importers use it as a suffix / default sense label - TODO confirm.
STD_SENSE = '_SENSE'
class CorpusPreprocessor(object):
    """ Base class for corpus-specific preprocessors.

    Subclasses implement 'get_sentences' and 'get_tagged_tokens';
    'cache_dataset' validates both results and stores them via 'save_df'. """

    def __init__(self, corpus_name: CorpusName, corpus_cache_path: str):
        """ Remembers which corpus is processed and where it is cached.

        :param corpus_name: corpus identifier; used to generate the names of
            the cache files.
        :param corpus_cache_path: cache location; stored as returned by
            'add_and_get_abs_path'.
        """
        self.corpus_name = corpus_name
        self.corpus_cache_path = add_and_get_abs_path(corpus_cache_path)

    def get_sentences(self) -> DataFrame:
        """ Returns a DataFrame with a column 'sentence', where each sentence is
        a list of tokens.

        :raises NotImplementedError: always in this base class; subclasses
            have to override the method.
        """
        raise NotImplementedError(
            '{} must implement get_sentences()'.format(type(self).__name__))

    def get_tagged_tokens(self) -> DataFrame:
        """ Returns a DataFrame with the columns 'token', 'sense' and
        'tagged_sense' (all three are required by 'cache_dataset').

        :raises NotImplementedError: always in this base class; subclasses
            have to override the method.
        """
        raise NotImplementedError(
            '{} must implement get_tagged_tokens()'.format(
                type(self).__name__))

    @staticmethod
    def _require_columns(frame, required, origin: str) -> None:
        """ Raises an AssertionError naming every column of 'required' that is
        missing from 'frame'; 'origin' names the producer for the message. """
        missing = [name for name in required if name not in frame.columns]
        if missing:
            # Raised explicitly instead of using 'assert' so the check is not
            # stripped under 'python -O'. AssertionError is kept so callers
            # observe the same exception type as before.
            raise AssertionError(
                '{} returned a DataFrame without the column(s): {}'.format(
                    origin, ', '.join(missing)))

    def cache_dataset(self) -> None:
        """ Saves raw tokenized sentences for training and all tokens with their
        corresponding senses for evaluation at 'corpus_cache_path'.

        :raises NotImplementedError: if the subclass does not implement
            'get_sentences' or 'get_tagged_tokens'.
        :raises AssertionError: if a required column is missing; nothing is
            saved in that case.
        """
        sentences = self.get_sentences()
        tagged_tokens = self.get_tagged_tokens()

        # Validate both frames before anything is written.
        self._require_columns(sentences, ('sentence',), 'get_sentences()')
        self._require_columns(tagged_tokens,
                              ('token', 'sense', 'tagged_sense'),
                              'get_tagged_tokens()')

        save_df(self.corpus_cache_path,
                gen_sentences_file_name(self.corpus_name), sentences)
        save_df(self.corpus_cache_path,
                gen_tagged_tokens_file_name(self.corpus_name),
                tagged_tokens)