diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py
index 9b5a9fc9..44b7f589 100644
--- a/pyscraper/sp_2024/__main__.py
+++ b/pyscraper/sp_2024/__main__.py
@@ -16,7 +16,8 @@
 file_dir = Path(__file__).parent
 
 parldata = Path(file_dir, "..", "..", "..", "parldata")
 
-cache_dir = parldata / "cmpages" / "sp_2024"
+download_dir = parldata / "cmpages" / "sp_2024" / "raw"
+parsed_dir = parldata / "cmpages" / "sp_2024" / "parsed"
 
 output_dir = parldata / "scrapedxml" / "sp-new"
@@ -29,12 +30,16 @@
 def cache_dir_iterator(
     cache_dir: Path,
     start_date: datetime.date,
     end_date: datetime.date,
+    partial_file_name: str | None,
 ):
     """
     Return an iterator of files in the cache_dir that are between the start and end date
     """
     for file in cache_dir.glob("*.xml"):
+        if partial_file_name:
+            if not file.name.startswith(partial_file_name):
+                continue
         # date is an iso date at the start of the filename
         date = datetime.date.fromisoformat(file.stem[:10])
         if start_date <= date <= end_date:
@@ -87,21 +92,22 @@ def debates(
             start.isoformat(),
             end.isoformat(),
             verbose=verbose,
-            cache_dir=cache_dir,
+            cache_dir=download_dir,
             override=override,
         )
-    else:
-        file_iterator = cache_dir_iterator(cache_dir, start, end)
+        for file in file_iterator:
+            pass
 
-    for file in file_iterator:
-        if partial_file_name:
-            if not file.name.startswith(partial_file_name):
-                continue
-        if parse:
+    if parse:
+        file_iterator = cache_dir_iterator(download_dir, start, end, partial_file_name)
+        for file in file_iterator:
             if verbose:
                 print(f"Parsing up {file}")
-            tidy_up_html(file)
-        if convert:
+            tidy_up_html(file, parsed_dir)
+
+    if convert:
+        file_iterator = cache_dir_iterator(parsed_dir, start, end, partial_file_name)
+        for file in file_iterator:
             if verbose:
                 print(f"Converting {file} to TheyWorkForYou format")
             convert_xml_to_twfy(file, output_dir, verbose=verbose)
diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py
index 23b441f1..52345cd1 100644
--- a/pyscraper/sp_2024/parse.py
+++ b/pyscraper/sp_2024/parse.py
@@ -246,7 +246,7 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
     return soup
 
 
-def tidy_up_html(xml_path: Path):
+def tidy_up_html(xml_path: Path, output_dir: Path):
     """
     For each subsection there is a raw_html child
     This function will convert the raw_html element to a parsed child.
@@ -261,15 +261,15 @@ def tidy_up_html(xml_path: Path):
     for item in soup.find_all("agenda_item"):
         agenda_item_url = item.get("url")
 
-        # delete any 'parsed' child of the subsection element
-        for child in item.find_all("parsed"):
-            child.decompose()
-
         # process html
         raw_html = item.find("raw_html")
         parsed_data = process_raw_html(raw_html, agenda_item_url=agenda_item_url)
+        # replace raw_html with parsed
+        item.find("raw_html").decompose()
         item.append(parsed_data.find("parsed"))
 
     # dump the soup to a file
-    with xml_path.open("w") as f:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_file = output_dir / xml_path.name
+    with output_file.open("w") as f:
         f.write(soup.prettify())
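
Reviewer note: this change splits the single cmpages/sp_2024 cache into a raw/ download stage and a parsed/ stage, so tidy_up_html no longer rewrites files in place. Below is a minimal sketch of the resulting three-stage flow, using only the call signatures visible in this diff. The import paths for fetch_debates_for_dates and convert_xml_to_twfy are assumptions (the diff only shows tidy_up_html living in pyscraper.sp_2024.parse), and the directory constants are stand-ins for the module-level ones defined in __main__.py.

import datetime
from pathlib import Path

from pyscraper.sp_2024.parse import tidy_up_html
# cache_dir_iterator is defined in the package's __main__ module per this diff
from pyscraper.sp_2024.__main__ import cache_dir_iterator
# Assumed module paths; adjust to wherever these actually live in the package
from pyscraper.sp_2024.download import fetch_debates_for_dates
from pyscraper.sp_2024.convert import convert_xml_to_twfy

download_dir = Path("parldata/cmpages/sp_2024/raw")    # stage 1 output: raw pages
parsed_dir = Path("parldata/cmpages/sp_2024/parsed")   # stage 2 output: tidied XML
output_dir = Path("parldata/scrapedxml/sp-new")        # stage 3 output: TWFY XML

start = datetime.date(2024, 1, 1)
end = datetime.date(2024, 1, 31)

# Stage 1: fetch_debates_for_dates appears to be lazy (hence the bare
# for/pass added in the diff), so it must be drained for downloads to run.
for _ in fetch_debates_for_dates(
    start.isoformat(),
    end.isoformat(),
    verbose=True,
    cache_dir=download_dir,
    override=False,
):
    pass

# Stage 2: parse each raw file into parsed_dir rather than editing in place.
for file in cache_dir_iterator(download_dir, start, end, None):
    tidy_up_html(file, parsed_dir)

# Stage 3: convert each parsed file to TheyWorkForYou format.
for file in cache_dir_iterator(parsed_dir, start, end, None):
    convert_xml_to_twfy(file, output_dir, verbose=True)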