import sqlite3
import datetime
import itertools
import logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s.%(msecs)03d|%(message)s', datefmt='%H:%M:%S')
import pandas as pd
conn = sqlite3.connect('guardian.sqlite3')
# Overall stats to be calculated:
#   longest_held (by team): the portal with the biggest current_held_td
#   most_active: the portal with the smallest average_td
#   weakest_by_link (by team): the portal with the smallest weakest_by_link
#   most_links (by team): the portal with the most links
#   weakest_by_age (by team): the portal with the lowest (health, current_held_td)
# Per-portal stats to be calculated:
#   weakest_by_link: first_obs health / (first_obs level + first_obs links)
#   overall_td: first_obs obs_time - last_obs obs_time
#   flip_count: number of times an observation's team was not the previous obs' team
#   average_td: overall_td / flip_count
#   health: first_obs health
#   links: first_obs links
#   current_held_td: first_obs obs_time - most recent flip obs_time (how long the current team has held the portal)
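# A quick worked example of weakest_by_link (hypothetical numbers): a level-8
# portal at 60% health with 4 links scores 60 / (8 + 4) = 5; fewer links or
# higher health push the score up, so the smallest scores mark the weakest portals.
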
def calculate_per_portal_stats(portal_id, df):
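    """Reduce one portal's observation history (newest row first) into a
    one-row DataFrame of the per-portal stats described above."""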
    # There's a lot of wasted fetching here - we only look at the
    # first and last rows for most data, and the only time we need the
    # entire history is for the team column.
    # The pre-pandas version of the code worked the same way, so I'm copying
    # its design to get more of a head-to-head comparison. One advantage of
    # the pandas code is that it makes the data dependencies much clearer.
    flips = df['team'].diff().astype('bool')
    flips = flips[flips == True]
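    # For example, a hypothetical history (newest observation first):
    #   team:            2     2      1     1      0
    #   team.diff():     NaN   0     -1     0     -1
    #   astype('bool'):  True  False  True  False  True
    # The leading True is an artifact of the NaN that diff() produces for the
    # first row; the other True rows are real ownership changes.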
    # Note that we are descending - the first obs is the most recent chronological one!
    first_obs = df.iloc[0]
    last_obs = df.iloc[-1]
    overall_td = first_obs['obs_time'] - last_obs['obs_time']
    health = first_obs['health']
    links = first_obs['links']
    level = first_obs['level']
    team = first_obs['team']
    weakest_by_link = health / (level + links)
    if flips.size == 1:
        # Only the NaN row survived, so the portal never changed hands
        # during the observed history.
        flip_count = 0
        average_td = pd.Timedelta.max
        current_held_td = overall_td
    else:
        # The first row always counts as a flip because diff() inserts a NaN value when it shifts.
        # Ignore it, and get the index for the row after that one.
        flips = flips[1:]
        first_flip = flips.index[0]
        flip_count = flips.size
        average_td = overall_td / flip_count
        current_held_td = first_obs['obs_time'] - df.loc[first_flip]['obs_time']
    return pd.DataFrame({
        # portal_id is basically an index, and was originally an index,
        # however it's painful/impossible to sort by both a series
        # and the index value, so just make it a series I guess.
        "portal_id": [portal_id],
        "weakest_by_link": [weakest_by_link],
        "flip_count": [flip_count],
        "average_td": [average_td],
        "health": [health],
        "links": [links],
        "level": [level],
        "team": [team],
        "current_held_td": [current_held_td],
    })

def calculate_overall_stats(portals):
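    """Rank portals and write the leaderboard CSVs into csvs/.

    `portals` is the concatenation of the one-row frames returned by
    calculate_per_portal_stats; team 0 is the neutral faction."""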
    portals = portals.sort_values(by=['team'])
    most_active = portals[['portal_id', 'average_td', 'flip_count']].sort_values(by=['average_td', 'portal_id'])
    most_active = most_active[most_active['flip_count'] != 0]
    most_active['average_td'] = most_active['average_td'].astype('timedelta64[s]').astype('int64')
    most_active.to_csv('csvs/most_active_new.csv', index=False, line_terminator='\r\n')
    for team, group in portals[['team', 'portal_id', 'current_held_td']].groupby('team', sort=False):
        longest_held = group.sort_values(by=['current_held_td', 'portal_id'], ascending=[False, True])
        longest_held.drop(columns=['team'], inplace=True)
        longest_held['current_held_td'] = longest_held['current_held_td'].astype('timedelta64[s]').astype('int64')
        longest_held.to_csv('csvs/longest_held_{0}_new.csv'.format(team), index=False, line_terminator='\r\n')
    for team, group in portals[['team', 'portal_id', 'weakest_by_link', 'health', 'level', 'links']].groupby('team', sort=False):
        if team == 0:
            continue
        weakest_by_link = group.sort_values(by=['weakest_by_link', 'portal_id'])
        weakest_by_link.drop(columns=['weakest_by_link', 'team'], inplace=True)
        weakest_by_link.to_csv('csvs/weakest_by_link_{0}_new.csv'.format(team), index=False, line_terminator='\r\n')
    for team, group in portals[['team', 'portal_id', 'links']].groupby('team', sort=False):
        if team == 0:
            continue
        most_links = group.sort_values(by=['links', 'portal_id'], ascending=[False, False])
        most_links.drop(columns=['team'], inplace=True)
        most_links.to_csv('csvs/most_links_{0}_new.csv'.format(team), index=False, line_terminator='\r\n')
    for team, group in portals[['portal_id', 'team', 'current_held_td', 'health']].groupby('team', sort=False):
        if team == 0:
            continue
        weakest_by_age = group.sort_values(by=['current_held_td', 'health'], ascending=[False, False])
        weakest_by_age.drop(columns=['team'], inplace=True)
        weakest_by_age = weakest_by_age[weakest_by_age['health'] <= 40]
        weakest_by_age['current_held_td'] = weakest_by_age['current_held_td'].astype('timedelta64[s]').astype('int64')
        weakest_by_age.to_csv('csvs/weakest_by_age_{0}_new.csv'.format(team), index=False, line_terminator='\r\n')

def main():
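    """Load the full observation history, compute per-portal stats, and write the leaderboard CSVs."""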
    logging.debug('Starting')
    # From day one this query was going to return a lot of data,
    # making it a no-go to load the entire thing into memory at once.
    # Luckily, we only need a single portal's history at a time for
    # number crunching, so we only need a subset of rows at a time.
    # The initial solution was to query a list of all portals
    # and run this big UNION query in a loop, filtering in the WHERE by the
    # appropriate portal each time.
    # The WHERE clauses were all satisfied by indexes, so this should
    # be reasonably efficient, right?
    # Well, no. The program's run time was dominated by traversing the index
    # over and over again for each query; it was just pure overhead.
    # It's much faster to use those indexes in the ORDER BY
    # and use itertools.groupby to iterate over the cursor and efficiently
    # fetch rows for one portal at a time.
    # Lesson learned: if you're effectively doing a full table scan in
    # your code, do a full table scan in the query!
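    # Roughly, the slow per-portal version looked like this (a sketch, not the
    # original code; crunch() is a hypothetical stand-in for the stats function
    # and the inner SELECT is abbreviated):
    #   for (pid,) in conn.execute("SELECT DISTINCT portal FROM portal_obs"):
    #       history = conn.execute("SELECT ... WHERE portal_id = ?", (pid,)).fetchall()
    #       crunch(pid, history)
    # versus a single ORDER BY portal_id, obs_time DESC query (below) that is
    # walked exactly once.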
    df = pd.read_sql("""
        SELECT
            portal_id,
            cast(obs_time as float) / 1000 as obs_time,
            team, portal_level AS level, portal_health AS health, COALESCE(links, 0) AS links
        FROM (
            SELECT
                portal AS portal_id
                , obs_time*1000 AS obs_time
                , team AS team
                , level AS portal_level
                , health AS portal_health
                , id AS portal_obs_id
            FROM portal_obs
            UNION ALL
            SELECT
                portal AS portal_id
                , timestampMs AS obs_time
                , players.faction AS team
                , 1 AS portal_level
                , 100 AS portal_health
                , NULL AS portal_obs_id
            FROM captured_plexts
            JOIN players ON players.id = captured_plexts.player
            UNION ALL
            SELECT
                portal AS portal_id
                -- Insert a neutral portal observation in the
                -- millisecond before a portal is captured.
                , timestampMs-1 AS obs_time
                , 0 AS team
                , 1 AS portal_level
                , 0 AS portal_health
                , NULL AS portal_obs_id
            FROM captured_plexts
            JOIN players ON players.id = captured_plexts.player
        ) foo
        LEFT JOIN (
            SELECT portal_obs_id, COUNT(*) AS links
            FROM (
                SELECT portal_head AS portal_obs_id FROM link_obs
                UNION ALL
                SELECT portal_tail AS portal_obs_id FROM link_obs
            ) link_obs_union
            GROUP BY portal_obs_id
        ) link_obs USING (portal_obs_id)
        ORDER BY portal_id, obs_time DESC
        """, conn, parse_dates={"obs_time": {"unit": "s"}})
    logging.debug('Created mega dataframe.')
    df.info()
    # If you put the entire query above in a single dataframe, it's a giant
    # 500 MB struct that takes several minutes to populate - a non-starter
    # for iterative development.
    # In the pre-pandas version, itertools.groupby fetches rows as they're
    # needed from the cursor; the initial query processing takes ~30 sec,
    # most of which is due to the subquery that calculates links.
    # Part of me is disappointed that Pandas doesn't have something
    # out of the box for what seems to me like an ordinary problem,
    # but maybe there's something down in the intersection between
    # Pandas' GroupBy and the SQL query that can get the same efficiency.
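    # A minimal sketch of that streaming approach (crunch() is hypothetical and
    # portal_id is assumed to be the first column of each cursor row):
    #   cur = conn.execute(query)
    #   for portal_id, rows in itertools.groupby(cur, key=lambda row: row[0]):
    #       crunch(portal_id, list(rows))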
    per_portal_stats = []
    for portal_id, observations in df.groupby("portal_id", sort=False):
        per_portal_stats.append(calculate_per_portal_stats(portal_id, observations))
    logging.debug('Per-portal stats finished.')
    overall_df = pd.concat(per_portal_stats)
    logging.debug('Portal stats concatenated.')
    calculate_overall_stats(overall_df)
    logging.debug('Done.')

if __name__ == "__main__":
    main()