Merge pull request #444 from chriszs/dc-gh-238-use-current-links
fix(dc): use archive for 2014, links for others
palewire authored Feb 27, 2022
2 parents 2c5a68a + 805892f commit 6bb13e7
Showing 1 changed file: warn/scrapers/dc.py, with 24 additions and 17 deletions.
@@ -48,26 +48,24 @@ def scrape(
     assert len(table_list) > 0
 
     # Grab all the links
-    # link_table = table_list[0]
-    # href_list = [a["href"] for a in link_table.find_all("a")]
-
-    # As documented in #238, only the most recent pages appear to work.
-    # For now, I'm going to manually set the list of links and skip scraping
-    # https://github.com/biglocalnews/warn-scraper/issues/238
-    # This should be replaced with something drawn from the scrape above
-    # after the bug is fixed by the DC government
-    href_hack = [
-        "https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-2021",
-        "https://does.dc.gov/node/1468786",
-        "https://does.dc.gov/node/445852",
-        "https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-0",
-    ]
+    link_table = table_list[0]
+    link_lookup = {_extract_year(a.string): a["href"] for a in link_table.find_all("a")}
+
+    # As documented in #238, the page for 2014 is missing. We're
+    # testing whether the 2014 link currently points to the same
+    # URL as the 2018 page and, if so, scraping an archived copy
+    # from 2017 instead.
+    if link_lookup.get("2014") == link_lookup.get("2018"):
+        logger.warning("2014 link is the same as 2018 link, using archived 2014")
+        link_lookup[
+            "2014"
+        ] = "https://web.archive.org/web/20170210010137/http://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-closure%202014"
 
     # Download them all
     html_list = [
         root_html,
     ]
-    for href in href_hack:
+    for href in link_lookup.values():
 
         # Request the HTML
         r = utils.get_url(href)
@@ -106,8 +104,10 @@ def scrape(
         # Clean them up
         cell_list = [_clean_text(c.text) for c in cell_list]
 
-        # Pass them out
-        output_rows.append(cell_list)
+        # Add to the list if any cell in the row has data
+        # (filters out empty rows)
+        if any(cell_list):
+            output_rows.append(cell_list)
 
     # Set the export path
     data_path = data_dir / "dc.csv"
@@ -128,5 +128,12 @@ def _clean_text(text):
     return text.strip()
 
 
+def _extract_year(text):
+    """Extract the year from the string."""
+    if text is None:
+        return None
+    return re.sub(r"\D", "", text)
+
+
 if __name__ == "__main__":
     scrape()
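
For context, below is a minimal, self-contained sketch of the lookup-and-fallback technique this commit introduces. The SAMPLE_HTML document and the html.parser choice are illustrative assumptions, not code from the repository; in the real scraper, link_table is the first table on the DC DOES page that scrape() has already downloaded.

    # A hypothetical, runnable demonstration of the commit's approach.
    # SAMPLE_HTML is invented for illustration only.
    import re

    from bs4 import BeautifulSoup

    SAMPLE_HTML = """
    <table>
      <tr><td><a href="https://does.dc.gov/page/warn-2021">2021</a></td></tr>
      <tr><td><a href="https://does.dc.gov/page/warn-2018">Closures 2018</a></td></tr>
      <tr><td><a href="https://does.dc.gov/page/warn-2018">Closures 2014</a></td></tr>
    </table>
    """


    def _extract_year(text):
        """Extract the year from the string by stripping non-digits."""
        if text is None:
            return None
        return re.sub(r"\D", "", text)


    # Build a year -> URL lookup from the anchor tags, much as the
    # new code in scrape() does.
    link_table = BeautifulSoup(SAMPLE_HTML, "html.parser")
    link_lookup = {_extract_year(a.string): a["href"] for a in link_table.find_all("a")}

    # On the live site the 2014 link mistakenly points at the 2018 page,
    # so when the two URLs match, swap in a Wayback Machine copy of the
    # real 2014 page.
    if link_lookup.get("2014") == link_lookup.get("2018"):
        link_lookup[
            "2014"
        ] = "https://web.archive.org/web/20170210010137/http://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-closure%202014"

    for year, href in sorted(link_lookup.items()):
        print(year, href)

    # The commit's other change: skip rows where every cell is empty.
    rows = [["Acme Corp", "2021-01-01"], ["", ""]]
    kept = [r for r in rows if any(r)]  # keeps only the Acme row

Run against the sample document, the loop prints the archived URL for 2014 and the live URLs for 2018 and 2021, which is the behavior the logger.warning branch in the diff guards.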
