-
Notifications
You must be signed in to change notification settings - Fork 2
/
create_combined_dataset.py
62 lines (54 loc) · 2.57 KB
/
create_combined_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import codecs
import json
import glob
from datetime import datetime
import argparse
from tokenizer import word_tokenize, detokenize
def _summary_file_stem(entry):
    """Build the trailing part of the summary file name for one game entry.

    The stem has the form ``<YYYYMMDD>_<visitors>-<home>_<vis runs>-<home runs>``.
    Spaces in the team names become underscores and the ``D-backs`` nickname
    is expanded to ``Diamondbacks``.
    """
    # entry['day'] is stored as month_day_two-digit-year, e.g. "04_01_18".
    game_date = datetime.strptime(entry['day'], '%m_%d_%y').strftime("%Y%m%d")
    teams = entry['vis_name'].replace(" ", "_") + "-" + entry['home_name'].replace(" ", "_")
    teams = teams.replace('D-backs', 'Diamondbacks')
    score = str(entry['vis_line']['team_runs']) + "-" + str(entry['home_line']['team_runs'])
    return "_".join([game_date, teams, score])


def _read_summary_tokens(path):
    """Read the UTF-8 summary at *path* and return it as a flat token list.

    Every line is whitespace-split, passed through ``detokenize`` and then
    ``word_tokenize``; consecutive lines are separated by a single
    ``*NEWPARAGRAPH*`` marker token.
    """
    with codecs.open(path, encoding='utf-8') as summary_file:
        lines = summary_file.readlines()
    paragraphs = [
        " ".join(word_tokenize(detokenize(line.strip().split())))
        for line in lines
    ]
    return " *NEWPARAGRAPH* ".join(paragraphs).split()


def create_json(input_folder, input_summaries, output_folder):
    """Attach tokenized summaries to game entries and write the combined JSON.

    Every file in *input_folder* is loaded as a JSON list of game entries.
    For each entry the summary file is looked up in *input_summaries* with
    the glob pattern ``*<stem>`` (see ``_summary_file_stem``). Entries with
    exactly one match receive a ``'summary'`` key holding the token list;
    entries with no match or several matches are reported on stdout and left
    out. When at least one entry of a file was matched, the matched entries
    are written to ``combined_<filename>`` inside *output_folder*.

    Args:
        input_folder: Directory with the JSON stats files. A trailing path
            separator is optional.
        input_summaries: Directory with the summary text files. A trailing
            path separator is optional.
        output_folder: Existing directory that receives the combined files.
            A trailing path separator is optional.
    """
    for filename in os.listdir(input_folder):
        # os.path.join keeps the paths valid whether or not the caller
        # supplied a trailing separator on the folder arguments.
        stats_path = os.path.join(input_folder, filename)
        with codecs.open(stats_path, encoding='utf-8') as json_data:
            entries = json.load(json_data)
        print('filename', stats_path)

        output = []
        for entry in entries:
            pattern = os.path.join(input_summaries, "*" + _summary_file_stem(entry))
            files = glob.glob(pattern)
            if len(files) < 1:
                print(pattern + " not found")
            elif len(files) > 1:
                print(pattern + " multiple found")
            else:
                entry['summary'] = _read_summary_tokens(files[0])
                output.append(entry)

        # Skip writing a file when none of its entries had a usable summary.
        if output:
            combined_path = os.path.join(output_folder, 'combined_' + filename)
            with codecs.open(combined_path, 'w', encoding='utf-8') as outfile:
                json.dump(output, outfile)
# Command-line entry point. Guarded so that importing this module (for
# example to reuse create_json) does not parse sys.argv or touch the disk.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Combining box/line/play-by-play with summaries')
    # All three folders are mandatory: create_json cannot run without them,
    # so a missing one is reported as a usage error by argparse.
    parser.add_argument('-input_folder', type=str, required=True,
                        help='input folder containg box and line score stats')
    parser.add_argument('-input_summaries', type=str, required=True,
                        help='input folder containing summaries')
    parser.add_argument('-output_folder', type=str, required=True,
                        help='output folder')
    args = parser.parse_args()
    input_folder = args.input_folder
    input_summaries = args.input_summaries
    output_folder = args.output_folder
    create_json(input_folder, input_summaries, output_folder)