Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fixup! Add scraper for new Scottish Parliament site
Browse files — browse the repository at this point in the history
dracos committed Apr 19, 2024
1 parent 2c49130 commit 3c19e8e
Showing 4 changed files with 14 additions and 23 deletions.
9 changes: 5 additions & 4 deletions pyscraper/sp_2024/__main__.py
Original file line number Diff line number Diff line change
@@ -39,7 +39,9 @@ def fetch_debates_on_date(date: str, verbose: bool = False, override: bool = Fal
except ValueError:
print(f"{date} is not a valid iso date")

for file in fetch_debates_for_date(date, verbose=verbose, override=override):
for file in fetch_debates_for_date(
date, verbose=verbose, cache_dir=cache_dir, override=override
):
tidy_up_html(file)
convert_to_twfy(file, output_dir)

@@ -73,12 +75,11 @@ def fetch_debates_on_date_range(
)
@click.option("--end-date", help="isodate to end fetching debates at", required=True)
@click.option("--verbose", is_flag=True, help="Print verbose output")
@click.option("--override", is_flag=True, help="Override existing files")
def parse_debates_on_date_range(
start_date: str, end_date: str, verbose: bool = False, override: bool = False
start_date: str, end_date: str, verbose: bool = False
):
"""
Download transcripts from Scottish Parliament between a start and end date
Parse and convert transcripts between a start and end date
"""
start = datetime.datetime.fromisoformat(start_date)
end = datetime.datetime.fromisoformat(end_date)
19 changes: 4 additions & 15 deletions pyscraper/sp_2024/convert.py
Original file line number Diff line number Diff line change
@@ -76,7 +76,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
url = source.get("url")
title = source.get("title")
iso_date = source.get("date")
source_id = int(float(source.get("id")))
source_id = int(float(source.get("id")[1:]))

# remove [Draft] from title
title = title.replace("[Draft]", "").strip()
@@ -86,24 +86,15 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False

committee_slug = slugify_committee(title)

dest_path = output_dir / committee_slug / f"{iso_date}-{source_id}.xml"
dest_path = output_dir / committee_slug / f"{iso_date}_{source_id}.xml"
dest_path.parent.mkdir(parents=True, exist_ok=True)

id_factory = IDFactory(committee_slug=committee_slug, iso_date=iso_date)

# create a new major heading from the general description
major_heading = etree.Element("major_heading")

major_heading.set("id", id_factory.get_next_major_id())
major_heading.set("url", url)
major_heading.set("nospeaker", "True")
major_heading.text = f"{title} {date_str}"
root.append(major_heading)

# iterate through the agenda_items
for item in source.iter("agenda_item"):
# create a new major_heading from the agenda_item information
major_heading = etree.Element("major_heading")
# create a new major-heading from the agenda_item information
major_heading = etree.Element("major-heading")
major_heading.set("id", id_factory.get_next_major_id())
major_heading.set("url", item.get("url"))
major_heading.set("nospeaker", "True")
@@ -175,6 +166,4 @@ def convert_to_twfy(
else:
xmls = list(cache_dir.glob("*.xml"))
for xml in xmls:
if verbose:
print(f"Tidying up {xml}")
convert_xml_to_twfy(xml, output_dir, verbose=verbose)
6 changes: 3 additions & 3 deletions pyscraper/sp_2024/download.py
Original file line number Diff line number Diff line change
@@ -147,7 +147,7 @@ def get_debate_item_content(self, speech_id: str, url: str):
# create a new tree that has an agenda_item as the root
# with the id, heading and subheading as attributes

major_minor_id = f"{self.committee_date_id}.{speech_id}"
major_minor_id = f"a{self.committee_date_id}.{speech_id}"

root = etree.Element(
"agenda_item",
@@ -190,7 +190,7 @@ def construct_xml(self) -> etree.Element:
title=heading_title,
date=self.date,
committee=self.committee_date_slug,
id=f"{self.committee_date_id}.0",
id=f"c{self.committee_date_id}.0",
)

for item in items:
@@ -208,7 +208,7 @@ def save_xml(self, cache_dir: Path, override: bool = False) -> Path:
if filename.exists() is False or override:
xml = self.construct_xml()
with filename.open("wb") as f:
f.write(etree.tostring(xml, pretty_print=True))
f.write(etree.tostring(xml))
return filename


3 changes: 2 additions & 1 deletion pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
@@ -61,7 +61,8 @@
"map",
"menu",
"meta",
"noscript" "ol",
"noscript",
"ol",
"p",
"pre",
"q",

0 comments on commit 3c19e8e

Please sign in to comment.