diff --git a/warn/scrapers/hi.py b/warn/scrapers/hi.py
index 78421dd..0955799 100644
--- a/warn/scrapers/hi.py
+++ b/warn/scrapers/hi.py
@@ -1,13 +1,15 @@
import datetime
import logging
from pathlib import Path
+from time import sleep
+from urllib.parse import quote
from bs4 import BeautifulSoup
from .. import utils
__authors__ = ["Ash1R", "stucka"]
-__tags__ = ["html"]
+__tags__ = ["html", "pdf"]
__source__ = {
"name": "Workforce Development Hawaii",
"url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
@@ -28,21 +30,34 @@ def scrape(
cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
Returns: the Path where the file is written
"""
- firstpage = utils.get_url("https://labor.hawaii.gov/wdc/real-time-warn-updates/")
+    # Google Cache is a backup in case the state reinstates its anti-scraping check that requires a JavaScript-enabled browser
+ usegooglecache = False
+ cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A"
+
+ firstpageurl = "https://labor.hawaii.gov/wdc/real-time-warn-updates/"
+ if usegooglecache:
+ firstpageurl = cacheprefix + quote(firstpageurl)
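+        # e.g. https://webcache.googleusercontent.com/search?q=cache%3Ahttps%3A//labor.hawaii.gov/wdc/real-time-warn-updates/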
+
+ firstpage = utils.get_url(firstpageurl)
soup = BeautifulSoup(firstpage.text, features="html5lib")
pagesection = soup.select("div.primary-content")[0]
subpageurls = []
for atag in pagesection.find_all("a"):
href = atag["href"]
if href.endswith("/"):
- href = href[:-1]
- subpageurls.append(href)
+            href = href  # No longer trimming the trailing slash here; it gets trimmed later, just before the year is parsed
+ subpageurl = href
+ if usegooglecache:
+ subpageurl = cacheprefix + quote(subpageurl)
+ subpageurls.append(subpageurl)
+ masterlist = []
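+    # Each layoff notice gets collected as a dict keyed by these headers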
headers = ["Company", "Date", "PDF url", "location", "jobs"]
- data = [headers]
+ # data = [headers]
# lastdateseen = "2099-12-31"
for subpageurl in reversed(subpageurls):
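+        # Pause between subpage requests to go easy on the state's server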
+ sleep(2)
# Conditionally here, we want to check and see if we have the old cached files, or if the year is current or previous.
# Only need to download if it's current or previous year.
# But do we care enough to implement right now?
@@ -50,47 +65,67 @@ def scrape(
logger.debug(f"Parsing page {subpageurl}")
page = utils.get_url(subpageurl)
soup = BeautifulSoup(page.text, features="html5lib")
+ if subpageurl.endswith("/"):
+ subpageurl = subpageurl[:-1] # Trim off the final slash, if there is one
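+        # The final path segment is assumed to begin with the year, e.g. ".../2022..." -> "2022"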
pageyear = subpageurl.split("/")[-1][:4]
- tags = soup.select("p a[href*=pdf]")
- p_tags = [i.parent.get_text().replace("\xa0", " ").split("\n") for i in tags]
- clean_p_tags = [j for i in p_tags for j in i]
- dates = [k.split("–")[0].strip() for k in clean_p_tags]
- for i in range(len(dates)):
+ # There are at least two formats for Hawaii. In some years, each individual layoff is in a paragraph tag.
+        # In others, all the layoffs are grouped under a single paragraph tag, separated by <br>.
+        # BeautifulSoup converts that to a <br/>.
+        # But the call to parent also repeats a bunch of entries, so we need to make sure they're not duplicated.
+        # So in more recent years, finding the parent of the "p a" finds essentially one row of data.
+        # In the older years, the parent is ... all the rows of data, which gets repeated.
+        # So take each chunk of data, find the parent, do some quality checks, clean up the text,
+        # and skip duplicates.
+
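+        # Hypothetical example of the grouped (older) format:
+        #   <p>March 1, 2022 – <a href="a.pdf">Company A</a><br/>April 5, 2022 – <a href="b.pdf">Company B</a></p>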
+ selection = soup.select("p a[href*=pdf]")
+ rows = []
+ for child in selection:
+ parent = child.parent
+            for subitem in parent.prettify().split("<br/>"):
+ if len(subitem.strip()) > 5 and ".pdf" in subitem:
+ subitem = subitem.replace("\xa0", " ").replace("\n", "").strip()
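+                    # Re-parse the cleaned fragment so each layoff line becomes its own soup object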
+ row = BeautifulSoup(subitem, features="html5lib")
+ if row not in rows:
+ rows.append(row)
+
+ for row in rows:
+ line: dict = {}
+ for item in headers:
+ line[item] = None
+ graftext = row.get_text().strip()
+ tempdate = graftext
+
+            # Check that it's not an amendment and doesn't use the 3/17/2022 date format;
+            # most dates should look like "March 17, 2022"
+ if pageyear in tempdate and f"/{pageyear}" not in tempdate:
+ try:
+ tempdate = (
+ graftext.strip().split(pageyear)[0].strip() + f" {pageyear}"
+ )
+ except ValueError:
+                    logger.warning(f"Date conversion failed on row: {row}")
+
+ line["Date"] = tempdate
+
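+            # strptime turns dates like "March 17, 2022" into "2022-03-17"; unparseable dates are left as-is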
try:
- tempdate = dates[i].split(pageyear)[0].strip() + f" {pageyear}"
parsed_date = datetime.datetime.strptime(
tempdate, "%B %d, %Y"
).strftime("%Y-%m-%d")
- dates[i] = parsed_date
- # lastdateseen = parsed_date
-
- # Disabling amendment automation to shift fixes into warn-transformer instead.
- # If this needs to come back, uncomment the lastseendate references
- # then rebuild the below section as an else
+ line["Date"] = parsed_date
except ValueError:
- logger.debug(f"Date error: {dates[i]}, leaving intact")
- # if "*" in dates[i]:
- # logger.debug(
- # f"Date error: {dates[i]} as apparent amendment; saving as {lastdateseen}"
- # )
- # dates[i] = lastdateseen
- # else:
-
- for i in range(len(tags)):
- row = []
- url = tags[i].get("href")
- row.append(tags[i].get_text())
-
- row.append(dates[i])
-
- row.append(url)
- row.append(None) # location
- row.append(None) # jobs
- data.append(row)
+ logger.debug(f"Date error: '{tempdate}', leaving intact")
+
+ line["PDF url"] = row.select("a")[0].get("href")
+ line["Company"] = row.select("a")[0].get_text().strip()
+ masterlist.append(line)
+ if len(masterlist) == 0:
+ logger.error(
+ "No data scraped -- anti-scraping mechanism may be back in play -- try Google Cache?"
+ )
output_csv = data_dir / "hi.csv"
- utils.write_rows_to_csv(output_csv, data)
+ utils.write_dict_rows_to_csv(output_csv, headers, masterlist)
return output_csv