# update.py (forked from openstates/openstates-core)
# ruff: noqa: E402
from openstates.utils.django import init_django
init_django()
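
# NOTE: init_django() must run before the openstates imports below, because
# those modules touch Django models at import time; that is why E402
# (module-level import not at top of file) is suppressed for this file.
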
import argparse
import contextlib
import datetime
import glob
import importlib
import logging
import logging.config
import os
import signal
import sys
import traceback
import typing
from types import ModuleType
from django.db import transaction
from django.db.models import QuerySet
from openstates import settings, utils
from openstates.civiqa import civiqa_env
from openstates.civiqa.publisher import publish_os_update_finished
from openstates.cli.reports import (
ImportReport,
Plan,
Report,
ScraperReport,
generate_session_data_quality_report,
print_report,
save_report,
)
from openstates.data.models import Jurisdiction, LegislativeSession, RunPlan
from openstates.exceptions import CommandError, ScrapeError
from openstates.importers import (
BillImporter,
EventImporter,
JurisdictionImporter,
VoteEventImporter,
)
from openstates.importers.base import BaseImporter
from openstates.scrape import JurisdictionScraper, State
from openstates.scrape.base import Scraper

logger = logging.getLogger("openstates")
ALL_ACTIONS = ("scrape", "import")

class _Unset:
pass
UNSET = _Unset()
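
# UNSET is a sentinel: it lets override_settings() tell "attribute was absent"
# (delete it on exit) apart from "attribute was explicitly set to a falsy
# value such as None" (restore it on exit).
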
@contextlib.contextmanager
def override_settings(settings, overrides): # type: ignore
original = {}
for key, value in overrides.items():
original[key] = getattr(settings, key, UNSET)
setattr(settings, key, value)
yield
for key, value in original.items():
if value is UNSET:
delattr(settings, key)
else:
setattr(settings, key, value)
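
# A minimal usage sketch (the value here is a placeholder):
#     with override_settings(settings, {"SCRAPELIB_RPM": 30}):
#         ...  # settings.SCRAPELIB_RPM == 30 inside the block
#     # on exit, SCRAPELIB_RPM reverts to its prior value, or is deleted
#     # if it did not exist before
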
def get_jurisdiction(module_name: str) -> tuple[State, ModuleType]:
# get the state object
module = importlib.import_module(module_name)
for obj in module.__dict__.values():
# ensure we're dealing with a subclass of State
if isinstance(obj, type) and issubclass(obj, State) and obj != State:
return obj(), module
raise CommandError(f"Unable to import State subclass from {module_name}")
def do_scrape(
state: State,
legislative_session: LegislativeSession,
args: argparse.Namespace,
scraper_args_by_name: dict[str, dict[str, str]],
    bill_scrape_reports: typing.Optional[dict[str, ScraperReport]] = None,
) -> dict[str, ScraperReport]:
    # guard against the mutable-default-argument pitfall
    if bill_scrape_reports is None:
        bill_scrape_reports = {}
    scraper_reports: dict[str, ScraperReport] = {}
for scraper_name, scraper_args in scraper_args_by_name.items():
# make output and cache dirs
utils.makedirs(settings.CACHE_DIR)
datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module, scraper_name)
utils.makedirs(datadir)
# clear json from data dir
for f in glob.glob(datadir + "/*.json"):
os.remove(f)
ScraperClass = state.scrapers[scraper_name]
# new scraper each time
scraper: Scraper = ScraperClass(
state,
datadir,
legislative_session=legislative_session,
strict_validation=args.strict,
fastmode=args.fastmode,
realtime=args.realtime,
)
# votes scrapers depend on a successful bills scrape
if scraper_name == "votes":
bill_scrape_report = bill_scrape_reports.get(legislative_session.id)
if bill_scrape_report is None or bill_scrape_report.end is None:
raise ScrapeError(
f"Votes scraper requires successful bills scrape run for {legislative_session.identifier}"
)
scraper_reports[scraper_name] = scraper.do_scrape(**scraper_args)
return scraper_reports
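
# do_scrape() returns one ScraperReport per scraper name, e.g. (illustrative):
#     {"bills": ScraperReport(...), "votes": ScraperReport(...)}
# Each scraper_args dict is forwarded as keyword arguments to
# Scraper.do_scrape().
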
def do_import(
state: State, from_scrapers: list[str], args: argparse.Namespace
) -> dict[str, typing.Any]:
jurisdiction_importer = JurisdictionImporter(state.jurisdiction_id)
bill_importer = BillImporter(state.jurisdiction_id)
vote_event_importer = VoteEventImporter(state.jurisdiction_id, bill_importer)
event_importer = EventImporter(state.jurisdiction_id, vote_event_importer)
importers_per_scraper: dict[str, list[BaseImporter]] = {
"jurisdiction": [jurisdiction_importer],
"bills": [bill_importer, vote_event_importer],
"events": [event_importer],
"votes": [vote_event_importer],
}
importers: dict[BaseImporter, str] = {}
for scraper in from_scrapers:
for importer in importers_per_scraper[scraper]:
importers[importer] = scraper
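    # `importers` is keyed by importer instance, so an importer shared by two
    # scrapers (vote_event_importer serves both "bills" and "votes") is
    # registered once; the last scraper name seen determines its data dir.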
import_reports: dict[str, ImportReport] = {}
def do_importer(importer: BaseImporter, scraper_name: str) -> None:
datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module, scraper_name)
import_type = importer._type
logger.info(f"import {import_type}s...")
import_report = importer.import_directory(datadir)
import_reports[import_type] = import_report
with transaction.atomic():
for importer, scraper_name in importers.items():
do_importer(importer, scraper_name)
Jurisdiction.objects.filter(id=state.jurisdiction_id).update(
latest_bill_update=datetime.datetime.utcnow()
)
return import_reports

def check_session_list(juris: State) -> set[str]:
scraper = type(juris).__name__
    if not hasattr(juris, "get_session_list"):
        raise CommandError(f"{scraper}.get_session_list() is not provided")
    scraped_sessions = juris.get_session_list()
    if not scraped_sessions:
        raise CommandError(f"no sessions from {scraper}.get_session_list()")
active_sessions = set()
    # known sessions: the ignored list plus every configured session name
    sessions = set(juris.ignored_scraped_sessions)
for session in juris.legislative_sessions:
sessions.add(session.get("_scraped_name", session["identifier"]))
if session.get("active"):
active_sessions.add(session.get("identifier"))
if not active_sessions:
raise CommandError(f"No active sessions on {scraper}")
unaccounted_sessions = list(set(scraped_sessions) - sessions)
if unaccounted_sessions:
raise CommandError(
(
"Session(s) {sessions} were reported by {scraper}.get_session_list() "
"but were not found in {scraper}.legislative_sessions or "
"{scraper}.ignored_scraped_sessions."
).format(sessions=", ".join(unaccounted_sessions), scraper=scraper)
)
return active_sessions
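
# Illustrative failure mode: if get_session_list() returns {"2023", "2024"}
# but "2024" is neither a configured legislative_session nor listed in
# ignored_scraped_sessions, a CommandError forces the new session to be
# configured explicitly before scraping proceeds.
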
def do_update(
args: argparse.Namespace,
other_args: list[str],
state: State,
) -> list[RunPlan]:
    available_scrapers = getattr(state, "scrapers", {})
    if not available_scrapers:
        raise CommandError("no scrapers defined on jurisdiction")
    available_scrapers["jurisdiction"] = JurisdictionScraper
    scrapers_to_run: list[str] = []
    scraper_args_by_name: dict[str, dict[str, str]] = {}
if other_args:
# if the cmd line specified scrapers, only run those
scraper_args_by_name = _get_custom_scraper_args(
other_args, available_scrapers, args
)
scrapers_to_run = list(scraper_args_by_name.keys())
else:
scrapers_to_run = list(available_scrapers.keys())
order = ["jurisdiction", "bills", "votes", "events"]
scrapers_to_run.sort(key=lambda s: order.index(s) if s in order else 999)
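    # This fixed order matters: "jurisdiction" seeds metadata first, and
    # do_scrape() rejects "votes" unless a successful "bills" scrape for the
    # same session was already recorded in bill_scrape_reports.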
# modify args in-place so we can pass them around
if not args.actions:
args.actions = ALL_ACTIONS
run_plan_models: list[RunPlan] = []
bill_scrape_reports: dict[str, ScraperReport] = {}
for scraper_name in scrapers_to_run:
scraper_args = scraper_args_by_name.get(scraper_name, {})
        legislative_sessions: list[typing.Optional[LegislativeSession]]
        if scraper_name != "jurisdiction":
            legislative_sessions = list(get_legislative_sessions(args, state))
        else:
            # the jurisdiction scraper is session-independent
            legislative_sessions = [None]
for legislative_session in legislative_sessions:
report = Report(
jurisdiction_id=state.jurisdiction_id,
legislative_session=legislative_session,
start=utils.utcnow(),
plan=Plan(
module=args.module,
actions=args.actions,
scraper_args_by_name={scraper_name: scraper_args},
),
)
_print_report(report, "Initial report")
# save the pending report
run_plan_model = save_report(report)
try:
if "scrape" in args.actions:
report.scraper_reports = do_scrape(
state,
legislative_session,
args,
report.plan.scraper_args_by_name,
bill_scrape_reports,
)
if "bills" in report.scraper_reports:
bill_scrape_reports[
legislative_session.id
] = report.scraper_reports["bills"]
if "import" in args.actions:
scraper_names = list(report.scraper_reports.keys())
report.import_reports = do_import(state, scraper_names, args)
report.success = True
except Exception as exc:
report.success = False
report.exception = exc
report.traceback = traceback.format_exc()
report.end = utils.utcnow()
run_plan_model = save_report(report, run_plan_model)
run_plan_models.append(run_plan_model)
if legislative_session:
has_votes_scraper = state.scrapers.get("votes") is not None
_set_attempted_scrape_at(
legislative_session, scraper_name, report.start
)
if scraper_name == "bills" and not has_votes_scraper:
_set_attempted_scrape_at(legislative_session, "votes", report.start)
if report.success:
_set_successful_scrape_at(
legislative_session, scraper_name, report.end
)
if scraper_name == "bills" and not has_votes_scraper:
_set_successful_scrape_at(
legislative_session, "votes", report.end
)
legislative_session.save()
if report.success:
if scraper_name in ("bills", "votes"):
generate_session_data_quality_report(
legislative_session=report.legislative_session,
run_plan=run_plan_model,
)
publish_os_update_finished(run_plan_model)
_print_report(report, "Final report")
return run_plan_models

def _set_attempted_scrape_at(
legislative_session: LegislativeSession,
scraper_name: str,
start: datetime.datetime,
) -> None:
attr = f"last_attempted_{scraper_name}_scrape_at"
if hasattr(legislative_session, attr):
setattr(legislative_session, attr, start)
legislative_session.save()

def _set_successful_scrape_at(
legislative_session: LegislativeSession,
scraper_name: str,
end: datetime.datetime,
) -> None:
attr = f"last_successful_{scraper_name}_scrape_at"
if hasattr(legislative_session, attr):
setattr(legislative_session, attr, end)
legislative_session.save()

def _print_report(report: Report, caption: str) -> None:
print()
print()
print(f"### {caption}")
print()
print_report(report)
print()

def _get_custom_scraper_args(
other_args: list[str], available_scrapers: dict[str, type], args: argparse.Namespace
) -> dict[str, dict[str, str]]:
result = {}
# parse arg list in format: (scraper (k=v )+)+
scraper_name = None
for arg in other_args:
if "=" in arg:
if not scraper_name:
raise CommandError("argument {} before scraper name".format(arg))
k, v = arg.split("=", 1)
v = v.strip("'")
result[scraper_name][k] = v
elif arg in available_scrapers:
scraper_name = arg
result[scraper_name] = {}
else:
            raise CommandError(
                f"no such scraper: module={args.module} scraper={arg}"
            )
return result
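
# Illustrative parse (the "window" kwarg is hypothetical): a trailing arg list
#     bills window=7 events
# yields {"bills": {"window": "7"}, "events": {}}; values stay strings and
# are passed through to Scraper.do_scrape() as keyword arguments.
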

def get_legislative_sessions(
    args: argparse.Namespace, state: State
) -> "QuerySet[LegislativeSession]":
legislative_sessions = []
if args.session:
legislative_sessions = LegislativeSession.objects.filter(
identifier=args.session.strip("'"),
jurisdiction_id=state.jurisdiction_id,
)
if not legislative_sessions:
raise CommandError(
f"session {args.session} not found in {state.jurisdiction_id}"
)
else:
active_sessions = check_session_list(state)
logger.info(
f"no session specified. scraping all active sessions: {active_sessions}"
)
legislative_sessions = LegislativeSession.objects.filter(
identifier__in=active_sessions, jurisdiction_id=state.jurisdiction_id
)
if not legislative_sessions:
raise CommandError("no active legislative sessions found")
return legislative_sessions

def parse_args() -> tuple[argparse.Namespace, list[str]]:
parser = argparse.ArgumentParser("openstates", description="openstates CLI")
parser.add_argument("--debug", action="store_true", help="open debugger on error")
parser.add_argument(
"--loglevel",
default="INFO",
help=(
"set log level. options are: "
"DEBUG|INFO|WARNING|ERROR|CRITICAL "
"(default is INFO)"
),
)
# what to scrape
parser.add_argument("module", type=str, help="path to scraper module")
parser.add_argument("--session", type=str, help="session to scrape")
for arg in ALL_ACTIONS:
parser.add_argument(
"--" + arg,
dest="actions",
action="append_const",
const=arg,
help="only run {} post-scrape step".format(arg),
)
# scraper arguments
parser.add_argument(
"--nonstrict",
action="store_false",
dest="strict",
help="skip validation on save",
)
parser.add_argument(
"--fastmode", action="store_true", help="use cache and turn off throttling"
)
# settings overrides
parser.add_argument("--datadir", help="data directory", dest="SCRAPED_DATA_DIR")
parser.add_argument("--cachedir", help="cache directory", dest="CACHE_DIR")
parser.add_argument(
"-r", "--rpm", help="scraper rpm", type=int, dest="SCRAPELIB_RPM"
)
parser.add_argument(
"--timeout", help="scraper timeout", type=int, dest="SCRAPELIB_TIMEOUT"
)
parser.add_argument(
"--no-verify",
help="skip tls verification",
action="store_false",
dest="SCRAPELIB_VERIFY",
)
parser.add_argument(
"--retries", help="scraper retries", type=int, dest="SCRAPELIB_RETRIES"
)
parser.add_argument(
"--retry_wait",
help="scraper retry wait",
type=int,
dest="SCRAPELIB_RETRY_WAIT_SECONDS",
)
# realtime mode
parser.add_argument("--realtime", action="store_true", help="enable realtime mode")
# process args
return parser.parse_known_args()
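
# Illustrative invocation (entrypoint name and module path are assumptions):
#     python -m openstates.cli.update scrapers.ny --session 2023 --scrape
# Trailing args that argparse does not recognize are returned as `other`
# and parsed by _get_custom_scraper_args() as per-scraper kwargs.
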
def main() -> int:
args, other = parse_args()
civiqa_env.load()
# set log level from command line
    handler_level = getattr(logging, args.loglevel.upper(), logging.INFO)
settings.LOGGING["handlers"]["default"]["level"] = handler_level # type: ignore
logging.config.dictConfig(settings.LOGGING)
if args.debug:
try:
debug_module = importlib.import_module("ipdb")
except ImportError:
debug_module = importlib.import_module("pdb")
# turn on PDB-on-error mode
# stolen from http://stackoverflow.com/questions/1237379/
# if this causes problems in interactive mode check that page
def _tb_info(type, value, tb): # type: ignore
traceback.print_exception(type, value, tb)
debug_module.pm()
sys.excepthook = _tb_info
    logger.info(f"Module: {args.module}")
juris, module = get_jurisdiction(args.module)
overrides = {}
overrides.update(getattr(module, "settings", {}))
overrides.update(
{key: value for key, value in vars(args).items() if value is not None}
)
with override_settings(settings, overrides):
report = do_update(args, other, juris)
if any(not r.success for r in report):
return 1
else:
return 0

def shutdown_handler(signum: int, _) -> None:
logger.info("Signal received, safely shutting down.")
print("Exiting process.", flush=True)
sys.exit(0)

if __name__ == "__main__":
signal.signal(signal.SIGINT, shutdown_handler)
sys.exit(main())