From 3c19e8ecc2047e63b3c3d2111b704f147e96b49e Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Fri, 19 Apr 2024 16:54:08 +0100 Subject: [PATCH] fixup! Add scraper for new Scottish Parliament site --- pyscraper/sp_2024/__main__.py | 9 +++++---- pyscraper/sp_2024/convert.py | 19 ++++--------------- pyscraper/sp_2024/download.py | 6 +++--- pyscraper/sp_2024/parse.py | 3 ++- 4 files changed, 14 insertions(+), 23 deletions(-) diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py index c3302b33..f325f664 100644 --- a/pyscraper/sp_2024/__main__.py +++ b/pyscraper/sp_2024/__main__.py @@ -39,7 +39,9 @@ def fetch_debates_on_date(date: str, verbose: bool = False, override: bool = Fal except ValueError: print(f"{date} is not a valid iso date") - for file in fetch_debates_for_date(date, verbose=verbose, override=override): + for file in fetch_debates_for_date( + date, verbose=verbose, cache_dir=cache_dir, override=override + ): tidy_up_html(file) convert_to_twfy(file, output_dir) @@ -73,12 +75,11 @@ def fetch_debates_on_date_range( ) @click.option("--end-date", help="isodate to end fetching debates at", required=True) @click.option("--verbose", is_flag=True, help="Print verbose output") -@click.option("--override", is_flag=True, help="Override existing files") def parse_debates_on_date_range( - start_date: str, end_date: str, verbose: bool = False, override: bool = False + start_date: str, end_date: str, verbose: bool = False ): """ - Download transcripts from Scottish Parliament between a start and end date + Parse and convert transcripts between a start and end date """ start = datetime.datetime.fromisoformat(start_date) end = datetime.datetime.fromisoformat(end_date) diff --git a/pyscraper/sp_2024/convert.py b/pyscraper/sp_2024/convert.py index 38819036..770bb9fe 100644 --- a/pyscraper/sp_2024/convert.py +++ b/pyscraper/sp_2024/convert.py @@ -76,7 +76,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False url = source.get("url") title = source.get("title") iso_date = source.get("date") - source_id = int(float(source.get("id"))) + source_id = int(float(source.get("id")[1:])) # remove [Draft] from title title = title.replace("[Draft]", "").strip() @@ -86,24 +86,15 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False committee_slug = slugify_committee(title) - dest_path = output_dir / committee_slug / f"{iso_date}-{source_id}.xml" + dest_path = output_dir / committee_slug / f"{iso_date}_{source_id}.xml" dest_path.parent.mkdir(parents=True, exist_ok=True) id_factory = IDFactory(committee_slug=committee_slug, iso_date=iso_date) - # create a new major heading from the general description - major_heading = etree.Element("major_heading") - - major_heading.set("id", id_factory.get_next_major_id()) - major_heading.set("url", url) - major_heading.set("nospeaker", "True") - major_heading.text = f"{title} {date_str}" - root.append(major_heading) - # iterate through the agenda_items for item in source.iter("agenda_item"): - # create a new major_heading from the agenda_item information - major_heading = etree.Element("major_heading") + # create a new major-heading from the agenda_item information + major_heading = etree.Element("major-heading") major_heading.set("id", id_factory.get_next_major_id()) major_heading.set("url", item.get("url")) major_heading.set("nospeaker", "True") @@ -175,6 +166,4 @@ def convert_to_twfy( else: xmls = list(cache_dir.glob("*.xml")) for xml in xmls: - if verbose: - print(f"Tidying up {xml}") convert_xml_to_twfy(xml, output_dir, verbose=verbose) diff --git a/pyscraper/sp_2024/download.py b/pyscraper/sp_2024/download.py index 34f295e1..75c4b526 100644 --- a/pyscraper/sp_2024/download.py +++ b/pyscraper/sp_2024/download.py @@ -147,7 +147,7 @@ def get_debate_item_content(self, speech_id: str, url: str): # create a new tree, that has a agenda_item as the root # with the id, heading and subheading as attributes - major_minor_id = f"{self.committee_date_id}.{speech_id}" + major_minor_id = f"a{self.committee_date_id}.{speech_id}" root = etree.Element( "agenda_item", @@ -190,7 +190,7 @@ def construct_xml(self) -> etree.Element: title=heading_title, date=self.date, committee=self.committee_date_slug, - id=f"{self.committee_date_id}.0", + id=f"c{self.committee_date_id}.0", ) for item in items: @@ -208,7 +208,7 @@ def save_xml(self, cache_dir: Path, override: bool = False) -> Path: if filename.exists() is False or override: xml = self.construct_xml() with filename.open("wb") as f: - f.write(etree.tostring(xml, pretty_print=True)) + f.write(etree.tostring(xml)) return filename diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py index ab6ddbef..eba26735 100644 --- a/pyscraper/sp_2024/parse.py +++ b/pyscraper/sp_2024/parse.py @@ -61,7 +61,8 @@ "map", "menu", "meta", - "noscript" "ol", + "noscript", + "ol", "p", "pre", "q",