-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_submission_data.py
134 lines (114 loc) · 4.63 KB
/
generate_submission_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import csv
import pandas as pd
import re
# ### Process
#
# 1. load `csv` layout file
# 2. Parse algorithm:
# - If DayOfWeek and "Session" are included in title:
# - log DayOfWeek
# - log session ID: 1, 2, 3 or 1a, 2b, 3a etc
# - log submission id
# 3. load `csv` latest submissions file
#
#
# Assumptions:
# - Session 1: 800 - 1005
# - Session 2: 1025 - 1230
# - Session 3: 1630 - 1835
# - Continental breakfast before session 1
# - Coffee break after session 1
# - Lunch after session 2
# - Coffee break before session 3
# - Dinner after session 3 (if no banquet)
#
# Evening Events:
# - Banquet: define time
# - tutorials: define time
# Patterns used to turn a plain-text abstract into HTML; compiled once.
_SINGLE_NEWLINE = re.compile(r'(?<!\n)\n(?!\n)')
_WHITESPACE_RUN = re.compile(r'\s\s+')
_DAYS_OF_WEEK = ('Monday', 'Tuesday', 'Wednesday', 'Thursday')
_SLOTS_PER_SESSION = 5
# Check mark (U+2714) that flags the presenting author in the 'speaker?' column.
_SPEAKER_MARK = '\u2714'


def _text(value):
    """Return a cell value as a string, mapping a missing (NaN/None) cell to ''."""
    return '' if pd.isna(value) else str(value)


def _clean_abstract(text):
    """Convert a plain-text abstract to single-line HTML."""
    text = _SINGLE_NEWLINE.sub(' ', text)   # a lone \n is just a line wrap
    text = text.replace('\n', '<br/>')      # remaining \n mark paragraph breaks
    return _WHITESPACE_RUN.sub(' ', text)   # collapse runs of whitespace


def _parse_layout(data):
    """Build the day -> sessions -> parallel sessions skeleton from layout rows."""
    program = {}
    session = None          # the parallel session currently being filled
    expect_title = False    # True right after a "<Day> Session <id>" row
    talknum = 0
    for _, row in data.iterrows():
        title = row['title']
        if 'END OF CONFERENCE' in title:
            break
        if any(day + ' Session' in title for day in _DAYS_OF_WEEK):
            # session row, e.g. "Monday Session 1a"
            currentday, _word, sessionnumfull = title.split()
            print('\n', currentday, sessionnumfull)
            sessionnum = int(sessionnumfull[0]) - 1
            # three sessions per day
            sessions = program.setdefault(currentday, [[], [], []])
            # Keep a reference to the dict just appended instead of indexing
            # by a 0/1 "parallel" id, so any suffix (a, b, c, ...) fills its
            # own entry rather than overwriting the first one.
            session = {'title': None, 'talks': [0] * _SLOTS_PER_SESSION}
            sessions[sessionnum].append(session)
            expect_title = True
            continue
        if session is None:
            raise ValueError(
                'layout row %r precedes the first session header' % (title,))
        if expect_title:
            # row after the session row is the session title
            print(title)
            session['title'] = title.strip()
            expect_title = False
            talknum = 0
            continue
        # next rows until the following session row are submissions
        session['talks'][talknum] = int(row['#'])
        talknum += 1
    return program


def _attach_details(program, latestdata, authordata):
    """Fill per-talk titles, abstracts, keywords and author info in place."""
    for sessions in program.values():
        for slot in sessions:
            for p in slot:
                p['titles'] = [''] * _SLOTS_PER_SESSION
                p['abstract'] = [''] * _SLOTS_PER_SESSION
                p['authors'] = [''] * _SLOTS_PER_SESSION
                p['keywords'] = [''] * _SLOTS_PER_SESSION
                p['speaker'] = [None] * _SLOTS_PER_SESSION
                p['webpages'] = [''] * _SLOTS_PER_SESSION
                for i, t in enumerate(p['talks']):
                    if not t > 0:
                        continue  # empty speaker slot
                    submission = latestdata.loc[t]
                    p['titles'][i] = submission['title']
                    p['abstract'][i] = _clean_abstract(
                        _text(submission['abstract']))
                    p['keywords'][i] = _text(submission['keywords']).split()

                    df = authordata[authordata['submission #'] == t]
                    df = df.fillna('')
                    p['authors'][i] = [
                        first + ' ' + last
                        for first, last in zip(df['first name'],
                                               df['last name'])
                    ]
                    # Position of the first author flagged as speaker;
                    # defaults to the first author when nobody is flagged.
                    marks = df['speaker?'].tolist()
                    p['speaker'][i] = next(
                        (pos for pos, mark in enumerate(marks)
                         if mark == _SPEAKER_MARK),
                        0)
                    p['webpages'][i] = list(df['Web page'].values)


def generate(layoutfile='./data-not-in-version-control/Copper 2017_data - Submissions.csv',
             latestdatafile='./data-not-in-version-control/Copper 2017_data_2017-02-06.xlsx'):
    """Build the conference program from a layout CSV and a submissions workbook.

    Parameters
    ----------
    layoutfile : str
        CSV file (read as latin1) with at least the columns ``title`` and
        ``#``.  A row whose title contains ``"<Day> Session"`` starts a
        session, the row after it holds the session title, and the rows that
        follow hold submission numbers in ``#``.  Reading stops at a row whose
        title contains ``END OF CONFERENCE``.
    latestdatafile : str
        Excel workbook with a ``Submissions`` sheet (columns ``#``, ``title``,
        ``abstract``, ``keywords``) and an ``Authors`` sheet (columns
        ``submission #``, ``first name``, ``last name``, ``speaker?``,
        ``Web page``).

    Returns
    -------
    dict
        Maps a day name to a list of three sessions; each session is a list
        of parallel-session dicts with the keys ``title``, ``talks``,
        ``titles``, ``abstract``, ``authors``, ``keywords``, ``speaker`` and
        ``webpages``.  Every per-talk list has five slots; an unused slot has
        talk number 0.  Thursday, when present, keeps only its first two
        sessions.

    Raises
    ------
    ValueError
        If a submission row appears before any session header.
    KeyError
        If a scheduled submission number is absent from ``Submissions``.
    """
    data = pd.read_csv(layoutfile, encoding='latin1')
    program = _parse_layout(data)

    # remove last session (guarded: a layout need not contain Thursday)
    if 'Thursday' in program:
        program['Thursday'] = program['Thursday'][:2]

    # check data
    total = sum(
        1
        for sessions in program.values()
        for slot in sessions
        for p in slot
        for t in p['talks']
        if t > 0
    )
    print("%d talks" % total)

    # The context manager closes the workbook handle once both sheets are read.
    with pd.ExcelFile(latestdatafile) as xls:
        latestdata = xls.parse('Submissions').set_index("#")
        authordata = xls.parse('Authors')

    _attach_details(program, latestdata, authordata)
    return program