Add FL hidden year PDFs #447

Merged · 2 commits · Feb 27, 2022
warn/scrapers/fl.py — 30 changes: 22 additions & 8 deletions
@@ -54,13 +54,20 @@ def scrape(
     logger.debug(f"Request status is {response.status_code} for {url}")
     soup = BeautifulSoup(response.text, "html.parser")
     # find & visit each year's WARN page
-    links = soup.find_all(
-        "a", href=re.compile("^http://reactwarn.floridajobs.org/WarnList/")
-    )
+    base_url = "http://reactwarn.floridajobs.org/WarnList/"
+    links = soup.find_all("a", href=re.compile(base_url))
+    href_lookup = {_extract_year(link.text): link.get("href") for link in links}
+
+    # Loop through years and add any missing to the lookup
+    most_recent_year = int(list(href_lookup.keys())[0])
+    earliest_year = 2015  # We expect files to be available for at least 2015
+    for year in range(earliest_year, most_recent_year):
+        if str(year) not in href_lookup:
+            href_lookup[str(year)] = f"{base_url}viewPreviousYearsPDF?year={year}"
+
     output_rows = []
-    # scraped most recent year first
-    for year_url in links:
-        year_url = year_url.get("href")  # get URL from link
+    # Loop through years and scrape data
+    for year_url in href_lookup.values():
         if "PDF" in year_url:
             rows_to_add = _scrape_pdf(cache, cache_dir, year_url, headers)
         else:
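The backfill loop added above is the core of this PR: every year from 2015 up to (but not including) the most recent linked year that has no anchor on the index page gets a synthesized viewPreviousYearsPDF URL. A minimal standalone sketch of that behavior, using a hypothetical href_lookup in place of the live page's links (the most-recent-year href here is illustrative, not the site's real URL):

# Sketch of the backfill, assuming a dict ordered most-recent-year-first.
base_url = "http://reactwarn.floridajobs.org/WarnList/"
href_lookup = {
    "2021": f"{base_url}Records?year=2021",  # hypothetical most-recent link
    "2016": f"{base_url}viewPreviousYearsPDF?year=2016",
}

most_recent_year = int(list(href_lookup.keys())[0])  # relies on insertion order
for year in range(2015, most_recent_year):
    if str(year) not in href_lookup:
        # hidden years follow the same viewPreviousYearsPDF pattern
        href_lookup[str(year)] = f"{base_url}viewPreviousYearsPDF?year={year}"

print(sorted(href_lookup))
# ['2015', '2016', '2017', '2018', '2019', '2020', '2021']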
@@ -85,7 +92,7 @@ def scrape(
 def _scrape_html(cache, url, headers, page=1):
     urllib3.disable_warnings()  # sidestep SSL error
     # extract year from URL
-    year = re.search(r"year=([0-9]{4})", url, re.I).group(1)
+    year = _extract_year(url)
     html_cache_key = f"fl/{year}_page_{page}.html"
     current_year = datetime.date.today().year
     last_year = str(current_year - 1)
@@ -158,7 +165,7 @@ def _scrape_pdf(cache, cache_dir, url, headers):
     # sidestep SSL error
     urllib3.disable_warnings()
     # extract year from URL
-    year = re.search(r"year=([0-9]{4})", url, re.I).group(1)
+    year = _extract_year(url)
     pdf_cache_key = f"fl/{year}.pdf"
     download = ""
     # download pdf if not in the cache
@@ -231,5 +238,12 @@ def _is_header_row(field_idx, field):
     return field_idx == 0 and field == "COMPANY NAME"
 
 
+def _extract_year(text):
+    """Extract the year from the string."""
+    if text is None:
+        return None
+    return re.search(r"(\d{4})", text).group(1)
+
+
 if __name__ == "__main__":
     scrape()
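The new _extract_year helper consolidates the two identical year=([0-9]{4}) searches and also handles the link text used to build href_lookup. Note that it returns the year as a string (scrape() converts the most recent one with int()), and that re.search(r"(\d{4})", text).group(1) raises AttributeError when the text contains no four-digit run. A defensive variant, shown only as an illustration and not part of the merged change, returns None in that case too:

import re

def _extract_year_safe(text):
    """Like _extract_year, but returns None when no four-digit run is found."""
    if text is None:
        return None
    match = re.search(r"(\d{4})", text)
    return match.group(1) if match else None

assert _extract_year_safe("viewPreviousYearsPDF?year=2016") == "2016"
assert _extract_year_safe("no year here") is None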