fixup! Add scraper for new Scottish Parliament site

mysociety · Apr 19, 2024 · 3c19e8e
1 parent 2c49130
commit 3c19e8e
Showing 4 changed files with 14 additions and 23 deletions.
diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py
@@ -39,7 +39,9 @@ def fetch_debates_on_date(date: str, verbose: bool = False, override: bool = Fal
     except ValueError:
         print(f"{date} is not a valid iso date")
 
-    for file in fetch_debates_for_date(date, verbose=verbose, override=override):
+    for file in fetch_debates_for_date(
+        date, verbose=verbose, cache_dir=cache_dir, override=override
+    ):
         tidy_up_html(file)
         convert_to_twfy(file, output_dir)
 
@@ -73,12 +75,11 @@ def fetch_debates_on_date_range(
 )
 @click.option("--end-date", help="isodate to end fetching debates at", required=True)
 @click.option("--verbose", is_flag=True, help="Print verbose output")
-@click.option("--override", is_flag=True, help="Override existing files")
 def parse_debates_on_date_range(
-    start_date: str, end_date: str, verbose: bool = False, override: bool = False
+    start_date: str, end_date: str, verbose: bool = False
 ):
     """
-    Download transcripts from Scottish Parliament between a start and end date
+    Parse and convert transcripts between a start and end date
     """
     start = datetime.datetime.fromisoformat(start_date)
     end = datetime.datetime.fromisoformat(end_date)

diff --git a/pyscraper/sp_2024/convert.py b/pyscraper/sp_2024/convert.py
@@ -76,7 +76,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
     url = source.get("url")
     title = source.get("title")
     iso_date = source.get("date")
-    source_id = int(float(source.get("id")))
+    source_id = int(float(source.get("id")[1:]))
 
     # remove [Draft] from title
     title = title.replace("[Draft]", "").strip()
@@ -86,24 +86,15 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
 
     committee_slug = slugify_committee(title)
 
-    dest_path = output_dir / committee_slug / f"{iso_date}-{source_id}.xml"
+    dest_path = output_dir / committee_slug / f"{iso_date}_{source_id}.xml"
     dest_path.parent.mkdir(parents=True, exist_ok=True)
 
     id_factory = IDFactory(committee_slug=committee_slug, iso_date=iso_date)
 
-    # create a new major heading from the general description
-    major_heading = etree.Element("major_heading")
-
-    major_heading.set("id", id_factory.get_next_major_id())
-    major_heading.set("url", url)
-    major_heading.set("nospeaker", "True")
-    major_heading.text = f"{title} {date_str}"
-    root.append(major_heading)
-
     # iterate through the agenda_items
     for item in source.iter("agenda_item"):
-        # create a new major_heading from the agenda_item information
-        major_heading = etree.Element("major_heading")
+        # create a new major-heading from the agenda_item information
+        major_heading = etree.Element("major-heading")
         major_heading.set("id", id_factory.get_next_major_id())
         major_heading.set("url", item.get("url"))
         major_heading.set("nospeaker", "True")
@@ -175,6 +166,4 @@ def convert_to_twfy(
     else:
         xmls = list(cache_dir.glob("*.xml"))
     for xml in xmls:
-        if verbose:
-            print(f"Tidying up {xml}")
         convert_xml_to_twfy(xml, output_dir, verbose=verbose)
diff --git a/pyscraper/sp_2024/download.py b/pyscraper/sp_2024/download.py
@@ -147,7 +147,7 @@ def get_debate_item_content(self, speech_id: str, url: str):
         # create a new tree, that has a agenda_item as the root
         # with the id, heading and subheading as attributes
 
-        major_minor_id = f"{self.committee_date_id}.{speech_id}"
+        major_minor_id = f"a{self.committee_date_id}.{speech_id}"
 
         root = etree.Element(
             "agenda_item",
@@ -190,7 +190,7 @@ def construct_xml(self) -> etree.Element:
             title=heading_title,
             date=self.date,
             committee=self.committee_date_slug,
-            id=f"{self.committee_date_id}.0",
+            id=f"c{self.committee_date_id}.0",
         )
 
         for item in items:
@@ -208,7 +208,7 @@ def save_xml(self, cache_dir: Path, override: bool = False) -> Path:
         if filename.exists() is False or override:
             xml = self.construct_xml()
             with filename.open("wb") as f:
-                f.write(etree.tostring(xml, pretty_print=True))
+                f.write(etree.tostring(xml))
         return filename
 
 

diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py
@@ -61,7 +61,8 @@
     "map",
     "menu",
     "meta",
-    "noscript" "ol",
+    "noscript",
+    "ol",
     "p",
     "pre",
     "q",
@@ -61,7 +61,8 @@
         "map",
         "menu",
         "meta",
-        "noscript" "ol",
+        "noscript",
+        "ol",
         "p",
         "pre",
         "q",