From 3c19e8ecc2047e63b3c3d2111b704f147e96b49e Mon Sep 17 00:00:00 2001
From: Matthew Somerville <matthew@mysociety.org>
Date: Fri, 19 Apr 2024 16:54:08 +0100
Subject: [PATCH] fixup! Add scraper for new Scottish Parliament site

---
 pyscraper/sp_2024/__main__.py |  9 +++++----
 pyscraper/sp_2024/convert.py  | 19 ++++---------------
 pyscraper/sp_2024/download.py |  6 +++---
 pyscraper/sp_2024/parse.py    |  3 ++-
 4 files changed, 14 insertions(+), 23 deletions(-)

diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py
index c3302b33..f325f664 100644
--- a/pyscraper/sp_2024/__main__.py
+++ b/pyscraper/sp_2024/__main__.py
@@ -39,7 +39,9 @@ def fetch_debates_on_date(date: str, verbose: bool = False, override: bool = Fal
     except ValueError:
         print(f"{date} is not a valid iso date")
 
-    for file in fetch_debates_for_date(date, verbose=verbose, override=override):
+    for file in fetch_debates_for_date(
+        date, verbose=verbose, cache_dir=cache_dir, override=override
+    ):
         tidy_up_html(file)
         convert_to_twfy(file, output_dir)
 
@@ -73,12 +75,11 @@ def fetch_debates_on_date_range(
 )
 @click.option("--end-date", help="isodate to end fetching debates at", required=True)
 @click.option("--verbose", is_flag=True, help="Print verbose output")
-@click.option("--override", is_flag=True, help="Override existing files")
 def parse_debates_on_date_range(
-    start_date: str, end_date: str, verbose: bool = False, override: bool = False
+    start_date: str, end_date: str, verbose: bool = False
 ):
     """
-    Download transcripts from Scottish Parliament between a start and end date
+    Parse and convert transcripts between a start and end date
     """
     start = datetime.datetime.fromisoformat(start_date)
     end = datetime.datetime.fromisoformat(end_date)
diff --git a/pyscraper/sp_2024/convert.py b/pyscraper/sp_2024/convert.py
index 38819036..770bb9fe 100644
--- a/pyscraper/sp_2024/convert.py
+++ b/pyscraper/sp_2024/convert.py
@@ -76,7 +76,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
     url = source.get("url")
     title = source.get("title")
     iso_date = source.get("date")
-    source_id = int(float(source.get("id")))
+    source_id = int(float(source.get("id")[1:]))
 
     # remove [Draft] from title
     title = title.replace("[Draft]", "").strip()
@@ -86,24 +86,15 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
 
     committee_slug = slugify_committee(title)
 
-    dest_path = output_dir / committee_slug / f"{iso_date}-{source_id}.xml"
+    dest_path = output_dir / committee_slug / f"{iso_date}_{source_id}.xml"
     dest_path.parent.mkdir(parents=True, exist_ok=True)
 
     id_factory = IDFactory(committee_slug=committee_slug, iso_date=iso_date)
 
-    # create a new major heading from the general description
-    major_heading = etree.Element("major_heading")
-
-    major_heading.set("id", id_factory.get_next_major_id())
-    major_heading.set("url", url)
-    major_heading.set("nospeaker", "True")
-    major_heading.text = f"{title} {date_str}"
-    root.append(major_heading)
-
     # iterate through the agenda_items
     for item in source.iter("agenda_item"):
-        # create a new major_heading from the agenda_item information
-        major_heading = etree.Element("major_heading")
+        # create a new major-heading from the agenda_item information
+        major_heading = etree.Element("major-heading")
         major_heading.set("id", id_factory.get_next_major_id())
         major_heading.set("url", item.get("url"))
         major_heading.set("nospeaker", "True")
@@ -175,6 +166,4 @@ def convert_to_twfy(
     else:
         xmls = list(cache_dir.glob("*.xml"))
     for xml in xmls:
-        if verbose:
-            print(f"Tidying up {xml}")
         convert_xml_to_twfy(xml, output_dir, verbose=verbose)
diff --git a/pyscraper/sp_2024/download.py b/pyscraper/sp_2024/download.py
index 34f295e1..75c4b526 100644
--- a/pyscraper/sp_2024/download.py
+++ b/pyscraper/sp_2024/download.py
@@ -147,7 +147,7 @@ def get_debate_item_content(self, speech_id: str, url: str):
         # create a new tree, that has a agenda_item as the root
         # with the id, heading and subheading as attributes
 
-        major_minor_id = f"{self.committee_date_id}.{speech_id}"
+        major_minor_id = f"a{self.committee_date_id}.{speech_id}"
 
         root = etree.Element(
             "agenda_item",
@@ -190,7 +190,7 @@ def construct_xml(self) -> etree.Element:
             title=heading_title,
             date=self.date,
             committee=self.committee_date_slug,
-            id=f"{self.committee_date_id}.0",
+            id=f"c{self.committee_date_id}.0",
         )
 
         for item in items:
@@ -208,7 +208,7 @@ def save_xml(self, cache_dir: Path, override: bool = False) -> Path:
         if filename.exists() is False or override:
             xml = self.construct_xml()
             with filename.open("wb") as f:
-                f.write(etree.tostring(xml, pretty_print=True))
+                f.write(etree.tostring(xml))
         return filename
 
 
diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py
index ab6ddbef..eba26735 100644
--- a/pyscraper/sp_2024/parse.py
+++ b/pyscraper/sp_2024/parse.py
@@ -61,7 +61,8 @@
     "map",
     "menu",
     "meta",
-    "noscript" "ol",
+    "noscript",
+    "ol",
     "p",
     "pre",
     "q",