def slugify_committee(name: str) -> str:
    """
    Convert a committee name to a slug, dropping a trailing date if present.

    Some titles carry a date after the committee name. Once slugified, such
    a date appears as three trailing hyphen-separated elements
    (``<day>-<month>-<year>``), which are removed so the date does not become
    part of the committee slug. ``convert_xml_to_twfy`` uses the result as the
    per-committee directory name under its output directory.

    The date is only stripped when:

    * the last three elements really look like a date (1-2 digit day, a
      month given as a word or a 1-2 digit number, and a 4 digit year), and
    * at least one element of the name is left in front of it.

    Otherwise the slug is returned unchanged. This avoids truncating names
    that merely end in a four-digit number and avoids returning an empty
    slug, which would make the caller write directly into the output
    directory root.

    Args:
        name: The committee title as found in the source document.

    Returns:
        The slugified committee name without a trailing date.
    """
    # Local import: keeps this helper self-contained regardless of what the
    # module imports at the top.
    import re

    slug = slugify(name)

    # group(1) is everything before the final "<day>-<month>-<year>" elements.
    # The month is either a non-numeric word or a 1-2 digit number.
    dated = re.fullmatch(r"(.+)-\d{1,2}-(?:[^-\d]+|\d{1,2})-\d{4}", slug)
    if dated is None:
        return slug

    return dated.group(1)