From e52975623e2d80e2fbbdd5fdaf5193ae876a57b1 Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Wed, 17 Jan 2024 18:51:49 -0500
Subject: [PATCH] Patch HI to use Google cache for #600

I'm so sorry, @Ash1R
---
 warn/scrapers/hi.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/warn/scrapers/hi.py b/warn/scrapers/hi.py
index 78421dd..3aac060 100644
--- a/warn/scrapers/hi.py
+++ b/warn/scrapers/hi.py
@@ -1,13 +1,14 @@
 import datetime
 import logging
 from pathlib import Path
+from urllib.parse import quote
 
 from bs4 import BeautifulSoup
 
 from .. import utils
 
 __authors__ = ["Ash1R", "stucka"]
-__tags__ = ["html"]
+__tags__ = ["html", "pdf"]
 __source__ = {
     "name": "Workforce Development Hawaii",
     "url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
@@ -28,15 +29,17 @@ def scrape(
     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
     Returns: the Path where the file is written
     """
-    firstpage = utils.get_url("https://labor.hawaii.gov/wdc/real-time-warn-updates/")
+    cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A"
+
+    firstpage = utils.get_url(cacheprefix + quote("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
     soup = BeautifulSoup(firstpage.text, features="html5lib")
     pagesection = soup.select("div.primary-content")[0]
     subpageurls = []
     for atag in pagesection.find_all("a"):
         href = atag["href"]
         if href.endswith("/"):
-            href = href[:-1]
-        subpageurls.append(href)
+            href = href  # [:-1]
+            subpageurls.append(cacheprefix + quote(href))
 
     headers = ["Company", "Date", "PDF url", "location", "jobs"]
     data = [headers]
@@ -85,8 +88,8 @@ def scrape(
 
             row.append(dates[i])
             row.append(url)
-            row.append(None) # location
-            row.append(None) # jobs
+            row.append(None)  # location
+            row.append(None)  # jobs
         data.append(row)
 
     output_csv = data_dir / "hi.csv"
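
The change itself is small: every fetched URL is routed through Google's cache endpoint, with the original address percent-encoded so it survives as a single query parameter. A minimal standalone sketch of that wrapping step, using only the standard library (wrap_with_cache is a hypothetical helper for illustration; the prefix string and the use of quote() are taken from the patch):

    from urllib.parse import quote

    # Google cache endpoint from the patch; "cache%3A" is the
    # percent-encoded "cache:" search operator.
    CACHEPREFIX = "https://webcache.googleusercontent.com/search?q=cache%3A"


    def wrap_with_cache(url: str) -> str:
        # quote() escapes ":" and "?" in the target URL while leaving "/"
        # alone (its default safe character), so the whole address stays
        # inside the single q= parameter instead of being split apart.
        return CACHEPREFIX + quote(url)


    print(wrap_with_cache("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
    # https://webcache.googleusercontent.com/search?q=cache%3Ahttps%3A//labor.hawaii.gov/wdc/real-time-warn-updates/

The same wrapping is applied to each subpage link scraped from the index page, which is why subpageurls.append() now stores cacheprefix + quote(href) rather than the raw href.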