-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathreplace_from_dictionary.py
76 lines (59 loc) · 2.14 KB
/
replace_from_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import collections
import csv
import os
import logging
from flashtext import KeywordProcessor
from .dictionary import MeSH as f_MeSH
class replace_from_dictionary(object):
    """
    Replace phrases from an input dictionary. The replacement is done without
    regard to case, but punctuation is handled correctly.

    The MeSH (Medical Subject Headings) dictionary is built-in.

    Example (given the MeSH dictionary):
        input:  '(11-Dimethylethyl)-4-methoxyphenol is great'
        output: 'MeSH_Butylated_Hydroxyanisole is great'
    """

    def __init__(self, f_dict=None, prefix="", suffix=""):
        """
        Initialize the parser.

        Args:
            f_dict: filename, location of the replacement dictionary
                (a CSV with at least 'term' and 'replacement' columns).
                Defaults to the bundled MeSH dictionary.
            prefix: string, text to prefix each replacement.
            suffix: string, text to suffix each replacement.

        Raises:
            IOError: if the dictionary file does not exist.
        """
        self.logger = logging.getLogger(__name__)

        if f_dict is None:
            local_path = os.path.dirname(__file__)
            f_dict = os.path.join(local_path, f_MeSH)
            # Lazy %-style args: the message is only formatted if DEBUG is on.
            self.logger.debug("Using default dictionary: %s", f_dict)

        if not os.path.exists(f_dict):
            msg = "Can't find dictionary {}".format(f_dict)
            self.logger.error(msg)
            # Pass the message so callers can see which file was missing
            # (previously raised a bare IOError() with no detail).
            raise IOError(msg)

        self.prefix = prefix
        self.suffix = suffix

        # Group source terms by their replacement: replacement -> [term, ...],
        # the mapping shape expected by KeywordProcessor.add_keywords_from_dict.
        terms = collections.defaultdict(list)

        # newline="" is the documented way to open files for the csv module.
        with open(f_dict, newline="") as FIN:
            csvfile = csv.DictReader(FIN)
            for row in csvfile:
                terms[row["replacement"]].append(row["term"])

        self.FT = KeywordProcessor()
        self.FT.add_keywords_from_dict(terms)

    def __call__(self, doc):
        """
        Runs the parser, replacing every matched phrase in the input.

        Args:
            doc: a document string

        Returns:
            doc: a document string with matches replaced by
                prefix + replacement + suffix
        """
        # span_info=True yields (replacement, start, end) triples in order,
        # so we can stitch the untouched text back around each replacement.
        keywords = self.FT.extract_keywords(doc, span_info=True)

        n = 0  # index of the first character not yet copied to the output
        tokens = []
        for word, i, j in keywords:
            if n < i:
                tokens.append(doc[n:i])
            tokens.append("".join([self.prefix, word, self.suffix]))
            n = j

        # Append any trailing text after the last match.
        tokens.append(doc[n:])
        return "".join(tokens)