Skip to content

Commit

Permalink
[Scotland] Store intermediate output in separate files.
Browse files Browse the repository at this point in the history
  • Loading branch information
TheyWorkForYou Live CVS User committed May 3, 2024
1 parent 5b0f974 commit 5dccfc3
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 17 deletions.
28 changes: 17 additions & 11 deletions pyscraper/sp_2024/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
file_dir = Path(__file__).parent
parldata = Path(file_dir, "..", "..", "..", "parldata")

cache_dir = parldata / "cmpages" / "sp_2024"
download_dir = parldata / "cmpages" / "sp_2024" / "raw"
parsed_dir = parldata / "cmpages" / "sp_2024" / "parsed"
output_dir = parldata / "scrapedxml" / "sp-new"


Expand All @@ -29,12 +30,16 @@ def cache_dir_iterator(
cache_dir: Path,
start_date: datetime.date,
end_date: datetime.date,
partial_file_name: str | None,
):
"""
Return an iterator of files in the cache_dir that are between the start and end date
"""

for file in cache_dir.glob("*.xml"):
if partial_file_name:
if not file.name.startswith(partial_file_name):
continue
# date is an iso date at the start of the filename
date = datetime.date.fromisoformat(file.stem[:10])
if start_date <= date <= end_date:
Expand Down Expand Up @@ -87,21 +92,22 @@ def debates(
start.isoformat(),
end.isoformat(),
verbose=verbose,
cache_dir=cache_dir,
cache_dir=download_dir,
override=override,
)
else:
file_iterator = cache_dir_iterator(cache_dir, start, end)
for file in file_iterator:
pass

for file in file_iterator:
if partial_file_name:
if not file.name.startswith(partial_file_name):
continue
if parse:
if parse:
file_iterator = cache_dir_iterator(download_dir, start, end, partial_file_name)
for file in file_iterator:
if verbose:
print(f"Parsing up {file}")
tidy_up_html(file)
if convert:
tidy_up_html(file, parsed_dir)

if convert:
file_iterator = cache_dir_iterator(parsed_dir, start, end, partial_file_name)
for file in file_iterator:
if verbose:
print(f"Converting {file} to TheyWorkForYou format")
convert_xml_to_twfy(file, output_dir, verbose=verbose)
Expand Down
12 changes: 6 additions & 6 deletions pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
return soup


def tidy_up_html(xml_path: Path):
def tidy_up_html(xml_path: Path, output_dir: Path):
"""
For each subsection there is a raw_html child
This function will convert the raw_html element to a parsed child.
Expand All @@ -261,15 +261,15 @@ def tidy_up_html(xml_path: Path):
for item in soup.find_all("agenda_item"):
agenda_item_url = item.get("url")

# delete any 'parsed' child of the subsection element
for child in item.find_all("parsed"):
child.decompose()

# process html
raw_html = item.find("raw_html")
parsed_data = process_raw_html(raw_html, agenda_item_url=agenda_item_url)
# replace raw_html with parsed
item.find('raw_html').decompose()
item.append(parsed_data.find("parsed"))

# dump the soup to a file
with xml_path.open("w") as f:
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / xml_path.name
with output_file.open("w") as f:
f.write(soup.prettify())

0 comments on commit 5dccfc3

Please sign in to comment.