[Scotland] Store intermediate output in separate files.

mysociety · May 3, 2024 · 5dccfc3 · 5dccfc3
1 parent 5b0f974
commit 5dccfc3
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 17 deletions.
diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py
@@ -16,7 +16,8 @@
 file_dir = Path(__file__).parent
 parldata = Path(file_dir, "..", "..", "..", "parldata")
 
-cache_dir = parldata / "cmpages" / "sp_2024"
+download_dir = parldata / "cmpages" / "sp_2024" / "raw"
+parsed_dir = parldata / "cmpages" / "sp_2024" / "parsed"
 output_dir = parldata / "scrapedxml" / "sp-new"
 
 
@@ -29,12 +30,16 @@ def cache_dir_iterator(
     cache_dir: Path,
     start_date: datetime.date,
     end_date: datetime.date,
+    partial_file_name: str | None,
 ):
     """
     Return an iterator of files in the cache_dir that are between the start and end date
     """
 
     for file in cache_dir.glob("*.xml"):
+        if partial_file_name:
+            if not file.name.startswith(partial_file_name):
+                continue
         # date is an iso date at the start of the filename
         date = datetime.date.fromisoformat(file.stem[:10])
         if start_date <= date <= end_date:
@@ -87,21 +92,22 @@ def debates(
             start.isoformat(),
             end.isoformat(),
             verbose=verbose,
-            cache_dir=cache_dir,
+            cache_dir=download_dir,
             override=override,
         )
-    else:
-        file_iterator = cache_dir_iterator(cache_dir, start, end)
+        for file in file_iterator:
+            pass
 
-    for file in file_iterator:
-        if partial_file_name:
-            if not file.name.startswith(partial_file_name):
-                continue
-        if parse:
+    if parse:
+        file_iterator = cache_dir_iterator(download_dir, start, end, partial_file_name)
+        for file in file_iterator:
             if verbose:
                 print(f"Parsing up {file}")
-            tidy_up_html(file)
-        if convert:
+            tidy_up_html(file, parsed_dir)
+
+    if convert:
+        file_iterator = cache_dir_iterator(parsed_dir, start, end, partial_file_name)
+        for file in file_iterator:
             if verbose:
                 print(f"Converting {file} to TheyWorkForYou format")
             convert_xml_to_twfy(file, output_dir, verbose=verbose)

diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py
@@ -246,7 +246,7 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
     return soup
 
 
-def tidy_up_html(xml_path: Path):
+def tidy_up_html(xml_path: Path, output_dir: Path):
     """
     For each subsection there is a raw_html child
     This function will convert the raw_html element to a parsed child.
@@ -261,15 +261,15 @@ def tidy_up_html(xml_path: Path):
     for item in soup.find_all("agenda_item"):
         agenda_item_url = item.get("url")
 
-        # delete any 'parsed' child of the subsection element
-        for child in item.find_all("parsed"):
-            child.decompose()
-
         # process html
         raw_html = item.find("raw_html")
         parsed_data = process_raw_html(raw_html, agenda_item_url=agenda_item_url)
+        # replace raw_html with parsed
+        item.find('raw_html').decompose()
         item.append(parsed_data.find("parsed"))
 
     # dump the soup to a file
-    with xml_path.open("w") as f:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_file = output_dir / xml_path.name
+    with output_file.open("w") as f:
         f.write(soup.prettify())