-
Notifications
You must be signed in to change notification settings - Fork 3
/
belarus_active_users.py
335 lines (284 loc) · 12.2 KB
/
belarus_active_users.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
import bz2
import csv
import datetime
import gzip
import io
import json
import os.path
import sys
import time
from collections import defaultdict, Counter
import osmapi
import requests
import shapely.geometry
import shapely.wkt
from lxml import etree
from requests_oauthlib.oauth2_session import OAuth2Session
# --- Command-line configuration ---
# Usage: belarus_active_users.py DATE_FROM DATE_TO DUMP_FILE CLIENT_ID TOKEN CREATED_AT
DATE_FROM = sys.argv[1]  # lower bound, compared lexicographically as ISO-8601 text
DATE_TO = sys.argv[2]  # upper bound, same comparison
DUMP_FILE = os.path.expanduser(sys.argv[3])  # path to the bz2 changeset dump
OSM_CLIENT_ID = sys.argv[4]  # OAuth2 client id for the OSM API
OSM_TOKEN = sys.argv[5]  # pre-obtained OAuth2 access token
OSM_CREATED_AT = int(sys.argv[6])  # token issue time (Unix timestamp)
# Bounding rectangle of Belarus stored as [min_lat, max_lon, max_lat, min_lon]
# — note the unusual ordering, matched exactly by the unpacking below.
BBOX = [51.2626864, 32.7627809, 56.17218, 23.1783313]
B_MIN_LAT, B_MAX_LON, B_MAX_LAT, B_MIN_LON = BBOX
# Country polygons in WKT.  B_GEOM is the working boundary; B_GEOM_FULL is a
# second variant — points must fall inside BOTH to count as "in Belarus".
with open('belarus.wkt') as h:
    B_GEOM = shapely.wkt.load(h)
with open('belarus-full.wkt') as h:
    B_GEOM_FULL = shapely.wkt.load(h)
# Shrunken, simplified polygon: a bbox fully inside it is certainly inside
# the boundary, so the expensive per-object API check can be skipped.
B_GEOM_INNER = B_GEOM.buffer(-0.01).simplify(0.005)
# Accounts whose changesets are excluded from the detailed check
# (e.g. mass-revert accounts).
IGNORE_USERS = {'SomeoneElse_Revert'}
# Persistent cache of per-changeset boundary verdicts; JSON keys come back as
# strings, so they are converted to int on load.
CHANGESETS_IN_BOUNDARY_CACHE_FILE = 'belarus_changeset_cache.json'
with open(CHANGESETS_IN_BOUNDARY_CACHE_FILE) as h:
    CHANGESETS_IN_BOUNDARY_CACHE = {int(cid): isin for cid, isin in json.load(h).items()}
# OSM API client authenticated via OAuth2 (the older username/password call is
# kept below for reference).
# api = osmapi.OsmApi(username=API_USER, password=API_PASS,
api = osmapi.OsmApi(session=OAuth2Session(
    client_id=OSM_CLIENT_ID,
    scope=['write_api', 'write_redactions'],
    token={
        'access_token': OSM_TOKEN,
        'token_type': 'Bearer',
        'scope': ['write_api', 'write_redactions'],
        'created_at': OSM_CREATED_AT,
    },
))
def iter_changes_replication(top_dir, sub_dir, file_num):
    """Yield changeset-replication URLs, counting down from a starting position.

    The planet replication area is laid out as AAA/BBB/CCC.osm.gz with each
    component in 000..999.  Starting from top_dir/sub_dir/file_num, every
    earlier file is produced in descending order, rolling the sub-directory
    and file counters over at 999.
    """
    template = 'https://planet.openstreetmap.org/replication/changesets/{:03}/{:03}/{:03}.osm.gz'
    for d1 in range(top_dir, -1, -1):
        # Only the very first directory starts at the caller-supplied offsets;
        # every later one is walked from its top (999).
        d2_start = sub_dir if d1 == top_dir else 999
        for d2 in range(d2_start, -1, -1):
            f_start = file_num if (d1 == top_dir and d2 == sub_dir) else 999
            print(template.format(d1, d2, f_start))
            for f in range(f_start, -1, -1):
                yield template.format(d1, d2, f)
def iter_changes(dump):
    """Yield ``(changeset_id, attrib_get)`` for every changeset.

    First streams the local bz2 planet dump, then follows the live
    replication feed (newest file first) until it has seen more than 1000
    consecutive changesets already covered by the dump.

    ``attrib_get`` is the bound ``.get`` of the changeset element's XML
    attribute mapping, letting callers pull attributes cheaply without the
    element itself (which is cleared to bound memory use).
    """
    i = 0  # total changesets seen, for progress reporting
    latest_cid = 0  # highest changeset id present in the dump
    main_start = start = datetime.datetime.utcnow()
    # Phase 1: the local planet dump.
    with bz2.open(dump) as h:
        context = etree.iterparse(h, events=('end',), tag='changeset')
        for _, elem in context:
            i += 1
            if i % 1_000_000 == 0:
                end = datetime.datetime.utcnow()
                print(i, end - main_start, end - start)
                start = end
            attrib_get = elem.attrib.get
            cid = int(attrib_get('id'))
            latest_cid = max(latest_cid, cid)
            yield cid, attrib_get
            elem.clear()  # free the parsed XML as we go
    print(latest_cid)
    # Phase 2: replication diffs, walked backwards from the current state.
    session = requests.session()
    response = session.get('https://planet.openstreetmap.org/replication/changesets/state.yaml')
    response.raise_for_status()
    # state.yaml's third line looks like "sequence: NNNNNNN".
    last_id = int(response.text.splitlines()[2].split(': ')[1])
    old = 0  # consecutive changesets already covered by the dump
    for url in iter_changes_replication(last_id // 1_000_000, last_id // 1000 % 1000, last_id % 1000):
        # Fetch with up to 10 retries and exponential backoff.  The retry
        # index gets its own name ("attempt") so it no longer clobbers the
        # outer progress counter ``i`` (a bug in the previous version).
        for attempt in range(10 + 1):
            try:
                response = session.get(url)
                response.raise_for_status()
                break
            except Exception as err:
                print(url, err)
                if attempt == 10:
                    raise
                time.sleep(2 ** attempt)
        with gzip.open(io.BytesIO(response.content)) as h:
            context = etree.iterparse(h, events=('end',), tag='changeset')
            for _, elem in context:
                i += 1
                if i % 1_000_000 == 0:
                    end = datetime.datetime.utcnow()
                    print(i, end - main_start, end - start)
                    start = end
                attrib_get = elem.attrib.get
                cid = int(attrib_get('id'))
                if cid <= latest_cid:
                    old += 1
                else:
                    old = 0
                    # Only changesets newer than the dump are yielded;
                    # duplicates merely bump the ``old`` streak counter.
                    yield cid, attrib_get
                elem.clear()
        if old > 1000:
            return
def get_bbox_geom(min_lon, min_lat, max_lon, max_lat):
    """Build the smallest shapely geometry matching a changeset bbox.

    A fully degenerate bbox collapses to a Point, one with a single zero
    extent to a LineString; anything else becomes a rectangular polygon.
    """
    flat_lon = min_lon == max_lon
    flat_lat = min_lat == max_lat
    if flat_lon and flat_lat:
        return shapely.geometry.Point(min_lon, min_lat)
    if flat_lon or flat_lat:
        return shapely.geometry.LineString([(min_lon, min_lat), (max_lon, max_lat)])
    return shapely.geometry.box(min_lon, min_lat, max_lon, max_lat)
def process():
    """Scan all changesets and group the Belarus-relevant ones by user id.

    A changeset is kept when its closed_at (or, if still open, created_at)
    timestamp lies in [DATE_FROM, DATE_TO] and its bounding box intersects
    the Belarus boundary polygon.  Returns ``{uid: [changeset dict, ...]}``.
    """
    data = defaultdict(list)
    for cid, attrib_get in iter_changes(DUMP_FILE):
        created_at = attrib_get('created_at')
        closed_at = attrib_get('closed_at')
        # Prefer the close time for the date filter, fall back to creation.
        when = closed_at if closed_at is not None else created_at
        if when is None or not (DATE_FROM <= when <= DATE_TO):
            continue
        user = attrib_get('user')
        uid_str = attrib_get('uid')
        uid = None if uid_str is None else int(uid_str)
        min_lat = float(attrib_get('min_lat', 0))
        max_lat = float(attrib_get('max_lat', 0))
        min_lon = float(attrib_get('min_lon', 0))
        max_lon = float(attrib_get('max_lon', 0))
        # Cheap rejection first: overlap with Belarus's bounding rectangle.
        overlaps = (min_lat < B_MAX_LAT and B_MIN_LAT < max_lat
                    and min_lon < B_MAX_LON and B_MIN_LON < max_lon)
        if not overlaps:
            continue
        # Precise test against the actual boundary polygon.
        if not B_GEOM.intersects(get_bbox_geom(min_lon, min_lat, max_lon, max_lat)):
            continue
        data[uid].append({
            'cid': cid,
            'uid': uid,
            'user': user,
            'created_at': created_at,
            'closed_at': closed_at,
            'min_lat': min_lat,
            'min_lon': min_lon,
            'max_lat': max_lat,
            'max_lon': max_lon,
        })
    return data
def geom_intersects(cc):
    """Return the changesets in *cc* whose bbox intersects the Belarus boundary."""
    return [
        c for c in cc
        if B_GEOM.intersects(get_bbox_geom(c['min_lon'], c['min_lat'], c['max_lon'], c['max_lat']))
    ]
def geom_contains(cc):
    """Return the changesets in *cc* whose bbox lies fully inside the shrunken
    inner Belarus polygon (i.e. is unambiguously inside the country)."""
    return [
        c for c in cc
        if B_GEOM_INNER.contains(get_bbox_geom(c['min_lon'], c['min_lat'], c['max_lon'], c['max_lat']))
    ]
def count_mount(cc):
    """Count the distinct YYYY-MM months the given changesets fall into.

    Each changeset is attributed to the month of its closed_at timestamp,
    falling back to created_at when it has no close time.
    """
    months = {(c['closed_at'] or c['created_at'])[:7] for c in cc}
    return len(months)
def _split_chunks(items, max_chunk_size):
for i in range(0, len(items), max_chunk_size):
yield items[i:i + max_chunk_size]
def changeset_in_boundary(cid):
    """Return True as soon as any object touched by changeset *cid* resolves
    to a point inside Belarus (inside BOTH B_GEOM and B_GEOM_FULL).

    Downloads the changeset and walks its nodes directly, then the nodes
    referenced via ways and relations (resolving deleted objects through
    their last visible historical version).  Returns False only when no
    examined point falls inside the boundary.  Makes many OSM API calls.
    """
    print(cid)  # progress marker: these lookups are slow
    changeset = api.ChangesetDownload(cid)
    invisible_nodes = []  # ids of deleted nodes, resolved via history at the end
    invisible_ways = []   # ids of deleted ways, resolved via history below
    invisible_rels = []   # ids of deleted relations
    # nodes
    node_ids = [change['data']['id'] for change in changeset if change['type'] == 'node']
    nodes = [change['data'] for change in changeset if change['type'] == 'node']
    for node in nodes:
        if not node['visible']:
            # Deleted node versions carry no coordinates; defer to history.
            invisible_nodes.append(node['id'])
        else:
            point = shapely.geometry.Point(node['lon'], node['lat'])
            if B_GEOM.contains(point) and B_GEOM_FULL.contains(point):
                return True  # early exit: one inside point is enough
    # rels
    rels = [change['data'] for change in changeset if change['type'] == 'relation']
    rel_way_ids = [member['ref'] for rel in rels for member in rel['member'] if member['type'] == 'way']
    rel_node_ids = [member['ref'] for rel in rels for member in rel['member'] if member['type'] == 'node']
    invisible_rels.extend(rel['id'] for rel in rels if not rel['visible'])
    for rel_id in invisible_rels:
        # For a deleted relation, take the members of its last visible version.
        history = [version for version in api.RelationHistory(rel_id).values() if version['visible']]
        if history:
            rel = history[-1]
            rel_way_ids.extend(member['ref'] for member in rel['member'] if member['type'] == 'way')
            rel_node_ids.extend(member['ref'] for member in rel['member'] if member['type'] == 'node')
    rel_way_node_ids = []
    # 725 ids per request — presumably chosen to stay under the API's URL
    # length limit; TODO confirm against the OSM API docs.
    for chunk_ids in _split_chunks(rel_way_ids, 725):
        ways = list(api.WaysGet(chunk_ids).values())
        rel_way_node_ids.extend(node for way in ways for node in way['nd'])
        invisible_ways.extend(way['id'] for way in ways if not way['visible'])
    # ways
    ways = [change['data'] for change in changeset if change['type'] == 'way']
    way_node_ids = [node for way in ways for node in way['nd']]
    invisible_ways.extend(way['id'] for way in ways if not way['visible'])
    for way_id in invisible_ways:
        # For a deleted way, take the node list of its last visible version.
        history = [version for version in api.WayHistory(way_id).values() if version['visible']]
        if history:
            way = history[-1]
            way_node_ids.extend(way['nd'])
    # way and rel nodes
    # Fetch every referenced node not already delivered with the changeset.
    all_node_ids = list(set(way_node_ids + rel_node_ids + rel_way_node_ids) - set(node_ids))
    for chunk_ids in _split_chunks(all_node_ids, 725):
        nodes = api.NodesGet(chunk_ids)
        for node in nodes.values():
            if not node['visible']:
                invisible_nodes.append(node['id'])
            else:
                point = shapely.geometry.Point(node['lon'], node['lat'])
                if B_GEOM.contains(point) and B_GEOM_FULL.contains(point):
                    return True
    # deleted nodes
    # Last resort: locate deleted nodes via their last visible version.
    for node_id in invisible_nodes:
        history = [version for version in api.NodeHistory(node_id).values() if version['visible']]
        if history:
            node = history[-1]
            point = shapely.geometry.Point(node['lon'], node['lat'])
            if B_GEOM.contains(point) and B_GEOM_FULL.contains(point):
                return True
    return False
def changeset_in_boundary_cached(c):
    """Decide whether changeset *c* touched Belarus, caching expensive verdicts.

    Fast paths: a bbox missing the boundary entirely is a definite no; one
    fully inside the shrunken inner polygon is a definite yes.  Everything
    else requires the full per-object API check, whose verdict is stored in
    the on-disk cache.
    """
    cid = c['cid']
    bbox = get_bbox_geom(c['min_lon'], c['min_lat'], c['max_lon'], c['max_lat'])
    if not B_GEOM.intersects(bbox):
        return False
    if B_GEOM_INNER.contains(bbox):
        return True
    try:
        return CHANGESETS_IN_BOUNDARY_CACHE[cid]
    except KeyError:
        pass
    verdict = changeset_in_boundary(cid)
    CHANGESETS_IN_BOUNDARY_CACHE[cid] = verdict
    # Persist immediately so an interrupted run keeps its progress.
    with open(CHANGESETS_IN_BOUNDARY_CACHE_FILE, 'w') as h:
        json.dump(CHANGESETS_IN_BOUNDARY_CACHE, h, indent=2)
    return verdict
# --- Main pipeline ---
# Stage 1: collect candidate changesets per user (cached as belarus_users.json
# so the expensive dump scan runs only once).
if not os.path.exists('belarus_users.json'):
    data = process()
    with open('belarus_users.json', 'w') as h:
        json.dump(data, h, indent=2, ensure_ascii=False)
with open('belarus_users.json') as h:
    data = json.load(h)
# Re-filter by date after loading the cache.
# NOTE(review): this uses an exclusive upper bound (< DATE_TO) while process()
# uses an inclusive one (<= DATE_TO) — confirm which is intended.
data = {u: [c for c in cc if DATE_FROM <= (c['closed_at'] or c['created_at']) < DATE_TO] for u, cc in data.items()}
# Users active in at least 3 distinct months anywhere near Belarus.
data_edited_3_month = {u: cc for u, cc in data.items() if count_mount(cc) >= 3}
# Users whose 3 active months come from changesets certainly inside Belarus.
data_edited_3_month_in_bel = {u: cc for u, cc in data.items() if count_mount(geom_contains(cc)) >= 3}
# Remaining candidates: active 3 months, but not provably inside — these need
# the expensive per-changeset API check.
data_not_fully_checked = {u: cc for u, cc in data_edited_3_month.items() if u not in data_edited_3_month_in_bel}
data_detailed_checked = {
    u: [c for c in cc if changeset_in_boundary_cached(c)]
    for u, cc in data_not_fully_checked.items()
    if cc[0]['user'] not in IGNORE_USERS
}
# Re-apply the 3-month rule to the changesets that survived the detailed check.
data_detailed_checked_edited_3_month = {u: cc for u, cc in data_detailed_checked.items() if count_mount(cc) >= 3}
# Final set: provably-inside users plus those confirmed by the detailed check.
data_final = {**data_edited_3_month_in_bel, **data_detailed_checked_edited_3_month}
print('origin', len(data), sum(len(cc) for cc in data.values()))
print('origin 3 month', len(data_edited_3_month), sum(len(cc) for cc in data_edited_3_month.values()))
print('contains 3 month', len(data_edited_3_month_in_bel), sum(len(cc) for cc in data_edited_3_month_in_bel.values()))
print('for check', len(data_not_fully_checked), sum(len(cc) for cc in data_not_fully_checked.values()))
print('checked', len(data_detailed_checked), sum(len(cc) for cc in data_detailed_checked.values()))
print('checked 3 month', len(data_detailed_checked_edited_3_month), sum(len(cc) for cc in data_detailed_checked_edited_3_month.values()))
print('final', len(data_final), sum(len(cc) for cc in data_final.values()))
# Report columns: one per month.  Hard-coded to the Sep 2021 – Sep 2022
# window; keep in sync with the DATE_FROM/DATE_TO arguments.
dates = [
    '2021-09', '2021-10', '2021-11',
    '2021-12', '2022-01', '2022-02',
    '2022-03', '2022-04', '2022-05',
    '2022-06', '2022-07', '2022-08',
    '2022-09',
]
def get_row(cc):
    """Build one CSV row for a user: name, uid, total edits, then one count
    per month in the global ``dates`` column list."""
    per_month = Counter((c['closed_at'] or c['created_at'])[:7] for c in cc)
    row = [cc[0]['user'], cc[0]['uid'], sum(per_month.values())]
    row.extend(per_month[month] for month in dates)
    return row
# Write the final report, most-active users first.
# newline='' is required when handing a file to csv.writer so the csv module
# controls row terminators itself; without it, Windows output gets an extra
# blank line between rows (see the csv module docs).
with open('belarus_active_users.csv', 'w', newline='') as h:
    writer = csv.writer(h)
    writer.writerow(['user', 'uid', 'sum'] + dates)
    writer.writerows([get_row(cc) for cc in sorted(data_final.values(), key=lambda cc: -len(cc))])