HI rebuild #605

Merged
merged 3 commits on Jan 30, 2024
109 changes: 72 additions & 37 deletions warn/scrapers/hi.py
@@ -1,13 +1,15 @@
import datetime
import logging
from pathlib import Path
from time import sleep
from urllib.parse import quote

from bs4 import BeautifulSoup

from .. import utils

__authors__ = ["Ash1R", "stucka"]
__tags__ = ["html", "pdf"]
__source__ = {
    "name": "Workforce Development Hawaii",
    "url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
@@ -28,69 +30,102 @@ def scrape(
    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)

    Returns: the Path where the file is written
    """
    # Google Cache is a backup if the state re-implements its JS-enabled browser equivalent
    usegooglecache = False
    cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A"

    firstpageurl = "https://labor.hawaii.gov/wdc/real-time-warn-updates/"
    if usegooglecache:
        firstpageurl = cacheprefix + quote(firstpageurl)

    firstpage = utils.get_url(firstpageurl)
    soup = BeautifulSoup(firstpage.text, features="html5lib")
    pagesection = soup.select("div.primary-content")[0]
    subpageurls = []
    for atag in pagesection.find_all("a"):
        href = atag["href"]
        if href.endswith("/"):
            href = href  # [:-1]
        subpageurl = href
        if usegooglecache:
            subpageurl = cacheprefix + quote(subpageurl)
        subpageurls.append(subpageurl)

    masterlist = []
    headers = ["Company", "Date", "PDF url", "location", "jobs"]
    # data = [headers]
    # lastdateseen = "2099-12-31"

    for subpageurl in reversed(subpageurls):
        sleep(2)
        # Conditionally, we could check whether we already have the old cached files,
        # or whether the year is current or previous. We only need to download
        # if it's the current or previous year. But do we care enough to implement that right now?

        logger.debug(f"Parsing page {subpageurl}")
        page = utils.get_url(subpageurl)
        soup = BeautifulSoup(page.text, features="html5lib")
        if subpageurl.endswith("/"):
            subpageurl = subpageurl[:-1]  # Trim off the final slash, if there is one
        pageyear = subpageurl.split("/")[-1][:4]

        # There are at least two formats for Hawaii. In some years, each individual layoff is in its own paragraph tag.
        # In others, all the layoffs are grouped under a single paragraph tag, separated by <br>,
        # which BeautifulSoup converts to <br/>.
        # But the call to parent also repeats a bunch of entries, so we need to make sure they're not duplicated.
        # In more recent years, finding the parent of the "p a" match yields essentially one row of data.
        # In the older years, the parent is ... all the rows of data, which gets repeated.
        # So take each chunk of data, find the parent, do some quality checks, clean up the text,
        # and don't engage with duplicates.

        selection = soup.select("p a[href*=pdf]")
        rows = []
        for child in selection:
            parent = child.parent
            for subitem in parent.prettify().split("<br/>"):
                if len(subitem.strip()) > 5 and ".pdf" in subitem:
                    subitem = subitem.replace("\xa0", " ").replace("\n", "").strip()
                    row = BeautifulSoup(subitem, features="html5lib")
                    if row not in rows:
                        rows.append(row)

        for row in rows:
            line: dict = {}
            for item in headers:
                line[item] = None
            graftext = row.get_text().strip()
            tempdate = graftext

            # Check that it's not an amendment and doesn't have the 3/17/2022 date format.
            # Most dates should look like "March 17, 2022".
            if pageyear in tempdate and f"/{pageyear}" not in tempdate:
                try:
                    tempdate = (
                        graftext.strip().split(pageyear)[0].strip() + f" {pageyear}"
                    )
                except ValueError:
                    print(f"Date conversion failed on row: {row}")

            line["Date"] = tempdate

            try:
                parsed_date = datetime.datetime.strptime(
                    tempdate, "%B %d, %Y"
                ).strftime("%Y-%m-%d")
                # lastdateseen = parsed_date

                # Disabling amendment automation to shift fixes into warn-transformer instead.
                # If this needs to come back, uncomment the lastdateseen references,
                # then rebuild the section below as an else.
                line["Date"] = parsed_date
            except ValueError:
                # if "*" in dates[i]:
                #     logger.debug(
                #         f"Date error: {dates[i]} as apparent amendment; saving as {lastdateseen}"
                #     )
                #     dates[i] = lastdateseen
                # else:
                logger.debug(f"Date error: '{tempdate}', leaving intact")

line["PDF url"] = row.select("a")[0].get("href")
line["Company"] = row.select("a")[0].get_text().strip()
masterlist.append(line)

if len(masterlist) == 0:
logger.error(
"No data scraped -- anti-scraping mechanism may be back in play -- try Google Cache?"
)
output_csv = data_dir / "hi.csv"
utils.write_rows_to_csv(output_csv, data)
utils.write_dict_rows_to_csv(output_csv, headers, masterlist)
return output_csv
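
To see why the scraper splits each paragraph's prettify() output on "<br/>", here is a small, self-contained sketch (not part of the PR) that mimics the years where Hawaii lists several layoffs inside a single <p> tag separated by <br>. The sample HTML, company names, and URLs are invented for illustration.

from bs4 import BeautifulSoup

# Invented sample: two layoff notices grouped under one <p>, separated by <br>,
# the way some years of the Hawaii page are formatted.
sample = (
    "<p>"
    '<a href="https://example.com/notice-a.pdf">Company A</a> – March 17, 2022<br>'
    '<a href="https://example.com/notice-b.pdf">Company B</a> – April 1, 2022'
    "</p>"
)

soup = BeautifulSoup(sample, features="html5lib")
parent = soup.select("p a[href*=pdf]")[0].parent  # the shared <p> tag

# prettify() renders <br> as <br/>, so splitting on it carves out one chunk per notice.
for chunk in parent.prettify().split("<br/>"):
    if len(chunk.strip()) > 5 and ".pdf" in chunk:
        chunk = chunk.replace("\xa0", " ").replace("\n", "").strip()
        row = BeautifulSoup(chunk, features="html5lib")
        print(row.select("a")[0].get("href"), "|", row.get_text().strip())

Each printed line corresponds to one dictionary the scraper builds with "Company", "Date", and "PDF url" keys.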


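Separately, a minimal way to exercise the rebuilt scraper locally might look like the sketch below. This assumes the module exposes scrape() with data_dir and cache_dir Path arguments, as its docstring describes; the directory paths are placeholders.

from pathlib import Path

from warn.scrapers import hi

# Hypothetical local run: write hi.csv under ./data and cache fetched pages under ./cache.
output_csv = hi.scrape(data_dir=Path("./data"), cache_dir=Path("./cache"))
print(f"Hawaii WARN data written to {output_csv}")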