Showing 2 changed files with 163 additions and 0 deletions.
@@ -0,0 +1,155 @@
""" | ||
This module contains tools to convert the unstructured HTML of the debates into structured XML. | ||
This is not the TWFY style XML - but tries to retain all information from the original. | ||
""" | ||
|
||
from __future__ import annotations | ||
|
||
import re | ||
from pathlib import Path | ||
|
||
from bs4 import BeautifulSoup, Tag | ||
|
||
# HTML elements we accept moving from raw_html to parsed | ||
acceptable_elements = [ | ||
"a", | ||
"abbr", | ||
"acronym", | ||
"address", | ||
"b", | ||
"big", | ||
"blockquote", | ||
"br", | ||
"caption", | ||
"center", | ||
"cite", | ||
"col", | ||
"colgroup", | ||
"dd", | ||
"dir", | ||
"div", | ||
"dl", | ||
"dt", | ||
"em", | ||
"h1", | ||
"h2", | ||
"h3", | ||
"h4", | ||
"h5", | ||
"h6", | ||
"i", | ||
"img", | ||
"li", | ||
"ol", | ||
"p", | ||
"pre", | ||
"q", | ||
"s", | ||
"small", | ||
"span", | ||
"strike", | ||
"strong", | ||
"sub", | ||
"sup", | ||
"table", | ||
"tbody", | ||
"td", | ||
"tfoot", | ||
"th", | ||
"thead", | ||
"title", | ||
"tr", | ||
"tt", | ||
"u", | ||
"ul", | ||
"timestamp", | ||
] | ||
|
||
|
||
def process_raw_html(raw_html: Tag, wrans_item_url: str) -> BeautifulSoup: | ||
""" | ||
Given the question html, convert it to a structured xml format | ||
This isn't yet matching TWFY schema or using the right IDs. | ||
The goal is to make a structured file that's a bit easier to work with. | ||
""" | ||
|
||
# Deal with timestamps that are not inside anything first | ||
raw_html = str(raw_html) | ||
soup = BeautifulSoup(raw_html, "html.parser") | ||
|
||
# convert a structure where there's a question with a question and a reply inside | ||
|
||
details = soup.find("ul") | ||
speaker_re = re.compile(r"Asked by:\s*([^,]*),\s*MSP for\s*(\w.*)", re.MULTILINE) | ||
responder_re = re.compile(r".*Answered by\s*(\w.*)\s*on", re.MULTILINE | re.DOTALL) | ||
lodged_re = re.compile(r"Date lodged:\s*(\d+ \w+ \d+)", re.MULTILINE) | ||
for li in details.find_all("li"): | ||
text = li.text.strip() | ||
|
||
speaker_match = re.match(speaker_re, text) | ||
responder_match = re.match(responder_re, text) | ||
lodged_match = re.match(lodged_re, text) | ||
|
||
tag = None | ||
match = None | ||
|
||
if speaker_match: | ||
tag = soup.new_tag("speaker") | ||
speaker = f"{speaker_match.group(1)}, {speaker_match.group(2)}" | ||
tag.append(speaker) | ||
elif responder_match: | ||
tag = soup.new_tag("responder") | ||
tag.append(responder_match.group(1)) | ||
elif lodged_match: | ||
tag = soup.new_tag("lodged") | ||
tag.append(lodged_match.group(1)) | ||
|
||
if tag: | ||
li.replace_with(tag) | ||
else: | ||
print(text) | ||
li.decompose() | ||
|
||
for h in soup.find_all("h3"): | ||
text = h.find_next("div") | ||
tag = None | ||
if h.strong.string.strip() == "Question": | ||
tag = soup.new_tag("question") | ||
elif h.strong.string.strip() == "Answer": | ||
tag = soup.new_tag("answer") | ||
if tag: | ||
h.replace_with(tag) | ||
tag.append(text) | ||
|
||
soup.find("raw_html").name = "parsed" | ||
|
||
return soup | ||
|
||
|
||
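
# Illustrative only (not part of the original commit): given an input fragment like
#   <raw_html><ul><li>Asked by: Some Member, MSP for Some Region</li></ul>
#   <h3><strong>Question</strong></h3><div>...</div></raw_html>
# the returned soup looks roughly like
#   <parsed><ul><speaker>Some Member, Some Region</speaker></ul>
#   <question><div>...</div></question></parsed>
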
def tidy_up_wrans_html(xml_path: Path, output_dir: Path):
    """
    For each subsection there is a raw_html child.
    This function converts the raw_html element to a parsed child.
    This can be rerun on already downloaded data.
    """

    with xml_path.open("r") as f:
        xml = f.read()

    soup = BeautifulSoup(xml, "html.parser")

    for item in soup.find_all("question"):
        wrans_item_url = item.get("url")

        # Process the raw HTML into structured elements
        raw_html = item.find("raw_html")
        parsed_data = process_raw_html(raw_html, wrans_item_url=wrans_item_url)
        # Replace raw_html with parsed
        item.find("raw_html").decompose()
        item.append(parsed_data.find("parsed"))

    # Dump the soup to a file
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / xml_path.name
    with output_file.open("w") as f:
        f.write(soup.prettify())
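
As a rough sketch of how this entry point might be driven (the __main__ guard and the paths below are illustrative additions, not part of the commit):

if __name__ == "__main__":
    # Hypothetical paths for illustration only; the real pipeline supplies its own.
    sample_xml = Path("data/wrans/2024-01-10.xml")
    parsed_dir = Path("data/wrans_parsed")

    # Replaces each <raw_html> child of a <question> with a <parsed> child and
    # writes the prettified result to parsed_dir / sample_xml.name.
    tidy_up_wrans_html(sample_xml, parsed_dir)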