-
Notifications
You must be signed in to change notification settings - Fork 0
/
checklist.py
131 lines (94 loc) · 2.84 KB
/
checklist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import re
import sys
def abbreviation_to_raw(abbrev: str) -> str:
    """Return *abbrev* with every period preceded by a backslash.

    The result can be embedded in a regular expression so that each
    period matches a literal "." rather than any character. No other
    character is escaped.
    """
    escaped_period = r"\."
    return abbrev.replace(".", escaped_period)
# Abbreviations ending in a period, converted to regex-safe form.
# substitute_title_abbreviations joins these into one alternation.
titles = tuple(
    abbreviation_to_raw(abbreviation)
    for abbreviation in (
        "Mr.",
        "MR.",
        "a.m.",
        "p.m.",
        "Dr.",
        "v.",
        "Ms.",
        "MS.",
        "Mrs.",
        "MRS.",
        "Jr.",
        "JR.",
        "U.S.",
    )
)
# Whole-word replacements applied by substitute_words, in this order.
# Keys are matched case-sensitively, so capitalised variants are listed
# explicitly where they are wanted.
substitutes_dict = {
    "cuz": "because",
    "wanna": "want to",
    "kinda": "kind of",
    "Kinda": "Kind of",
    "lemme": "let me",
    "Lemme": "Let me",
    "gimme": "give me",
    "gonna": "going to",
    "Gimme": "Give me",
    "alright": "all right",
    "video conference": "videoconference",
    "Video conference": "Videoconference",
}
# Opening words of parenthetical remarks; format_parentheticals looks for
# "(" immediately followed by one of these.
parenthetical = (
    "Defendant",
    "Plaintiff",
    "Off the record",
    "The reporter read back",
    "The record was replayed",
    "Videoconference connection",
    "Witness sworn",
)
def substitute_qa(text: str) -> str:
    """Normalise Q/A markers that follow a newline.

    A newline followed by "Q. " or "A. ", or by "Q:" or "A:" and another
    newline, becomes a newline, the letter, a period and a tab. A marker
    at the very start of *text* (no preceding newline) is left alone.
    """
    qa_marker = re.compile(r"(\n([QA])\. )|(\n([QA]):\n)")
    # Only one alternative matches at a time, so exactly one of groups 2
    # and 4 holds the letter; the other expands to an empty string.
    normalised_marker = r"\n\g<2>\g<4>.\t"
    return qa_marker.sub(normalised_marker, text)
def substitute_colloquy(text: str) -> str:
    """Replace a colon plus newline (or colon plus space) with ": ".

    The replacement is skipped when the colon directly follows
    "testified as follows", and it only happens when the next two
    characters are something other than "Q" followed by something other
    than a period.
    """
    colon_break = re.compile(
        r"(?<!testified as follows)(:\n|: )(?=[^Q][^.])"
    )
    return colon_break.sub(r": ", text)
def substitute_by_line_colon(text: str) -> str:
    """Append a colon to "by" / "BY ..." lines.

    Directly after a newline, a line that is exactly "by", or that starts
    with "BY" followed by at least one more character, gets a ":" added
    before its line break. The first line of *text* is never matched
    because the pattern requires a preceding newline.
    """
    by_line = re.compile(r"(?<=\n)(by|BY.+)\n")
    with_colon = r"\g<1>:\n"
    return by_line.sub(with_colon, text)
def substitute_punctuation_one_space(text: str) -> str:
    """Rewrite the space that follows "?" or "." (optionally quoted).

    The pattern matches a question mark or period, an optional double
    quote, and one space; the replacement is the captured punctuation
    followed by one space.

    NOTE(review): as written the replacement text equals the matched
    text, so the input comes back unchanged. A space may be missing from
    the pattern or the replacement -- confirm the intended spacing.
    """
    sentence_end = re.compile(r"([?.]\"?) ")
    return sentence_end.sub(r"\g<1> ", text)
def substitute_title_abbreviations(text: str) -> str:
    """Rewrite the space that follows any abbreviation listed in ``titles``.

    The entries of ``titles`` are joined into a single alternation that
    must start at a word boundary and be followed by one space; the
    replacement is the captured abbreviation followed by one space.

    NOTE(review): as written the replacement text equals the matched
    text, so the input comes back unchanged. A space may be missing from
    the pattern or the replacement -- confirm the intended spacing.
    """
    alternation = r"|".join(titles)
    abbreviation_then_space = re.compile(fr'\b({alternation}) ')
    return abbreviation_then_space.sub(r'\g<1> ', text)
def substitute_strike_that(text: str) -> str:
    """Start a new double-tabbed line after each "strike that" phrase.

    Matches "strike that" or "Strike that", then one character, then a
    space, and appends a newline and two tabs after the match. The "."
    in the pattern is unescaped, so any character (not only a period)
    is accepted in that position.
    """
    strike_phrase = re.compile(r"([sS]trike that. )")
    phrase_then_break = r"\g<1>\n\t\t"
    return strike_phrase.sub(phrase_then_break, text)
def substitute_double_colon(text: str) -> str:
    """Collapse each non-overlapping "::" in *text* to a single ":".

    The scan is a single left-to-right pass, so a run of three colons
    becomes two, not one.
    """
    # Splitting on "::" and re-joining with ":" gives the same result as
    # one left-to-right replace pass.
    pieces = text.split("::")
    return ":".join(pieces)
def format_tabs(text: str) -> str:
    """Indent Q/A lines with one tab and "label: " lines with two.

    First pass: after every newline that is followed by "Q" or "A" and
    one more character, insert a tab. The "." in that pattern is
    unescaped, so any line starting with "Q" or "A" qualifies.

    Second pass: every stretch of a line that ends in a colon and a
    space is put on its own line (an existing leading newline is reused)
    and prefixed with two tabs. Leading newlines of the final result are
    stripped.
    """
    qa_line_start = re.compile(r"\n([QA].)")
    labelled_prefix = re.compile(r"\n?(.+: )")

    with_qa_tabs = qa_line_start.sub(r"\n\t\g<1>", text)
    with_label_tabs = labelled_prefix.sub(r"\n\t\t\g<1>", with_qa_tabs)
    return with_label_tabs.lstrip('\n')
def format_parentheticals(text: str) -> str:
    """Put two tabs in front of recognised parenthetical remarks.

    A "(" immediately followed by any entry of ``parenthetical`` is
    rewritten as two tabs, the "(" and the matched entry.
    """
    openers = r"|".join(parenthetical)
    parenthetical_start = re.compile(fr'\(({openers})')
    return parenthetical_start.sub(r'\t\t(\g<1>', text)
def substitute_new_speaker(text: str) -> str:
    """Remove every "New Speaker:" label that is followed by a newline.

    Both the label and its line break are deleted; the text that followed
    them stays in place.

    Returns:
        *text* with all such labels removed.
    """
    return text.replace("New Speaker:\n", "")
def substitute_words(text: str) -> str:
    """Apply every whole-word replacement in ``substitutes_dict`` to *text*.

    Replacements run one after another in the dictionary's order, each on
    the output of the previous one. A key only matches when it is
    delimited by word boundaries, and matching is case-sensitive.

    Returns:
        The text after all replacements.
    """
    for old, new in substitutes_dict.items():
        # Escape the key so it is matched literally even if it ever
        # contains regex metacharacters.
        whole_word = re.compile(r"\b" + re.escape(old) + r"\b")
        # A callable replacement is inserted verbatim, so backslashes or
        # group references in the value are not interpreted by re.
        text = whole_word.sub(lambda _match, replacement=new: replacement, text)
    return text
def perform_checks(file_name, mutators) -> str:
    """Read *file_name* and run its text through each mutator in order.

    Args:
        file_name: Path of the text file to read.
        mutators: Iterable of callables; each takes the current text and
            returns the new text, which is passed to the next one.

    Returns:
        The text after the last mutator has run (the unmodified file
        contents when *mutators* is empty).
    """
    with open(file_name, 'r') as f:
        text = f.read()
    for mutator in mutators:
        text = mutator(text)
    # Hand the result back; without this the transformed text is lost
    # and the declared ``-> str`` is violated.
    return text
if __name__ == '__main__':
    # Command-line entry point: the first argument is the path of the
    # file to reformat.
    if len(sys.argv) < 2:
        print("Please provide a file")
        # sys.exit is always available; the bare ``exit`` helper is
        # injected by the site module and can be absent (e.g. python -S).
        sys.exit(1)
    file = sys.argv[1]
    # Order matters: each reformatter receives the previous one's output.
    reformatters = (
        substitute_qa,
        substitute_colloquy,
        substitute_by_line_colon,
        substitute_punctuation_one_space,
        substitute_title_abbreviations,
        substitute_strike_that,
        substitute_double_colon,
        format_tabs,
        format_parentheticals,
        substitute_new_speaker,
        substitute_words,
    )
    result = perform_checks(file, reformatters)
    # Emit the reformatted text so the run has a visible outcome; skip
    # the write when no text is handed back.
    if result is not None:
        sys.stdout.write(result)