From 44d6c2c89d4e8c9929d0104ef2ebb83452742c0e Mon Sep 17 00:00:00 2001 From: Mike Stucka Date: Wed, 15 Nov 2023 16:34:53 -0500 Subject: [PATCH] Rebuild KY --- warn/scrapers/ky.py | 129 ++++++++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/warn/scrapers/ky.py b/warn/scrapers/ky.py index 9280f756..ac21d2d3 100644 --- a/warn/scrapers/ky.py +++ b/warn/scrapers/ky.py @@ -1,7 +1,9 @@ +import csv +import logging import typing from pathlib import Path -import xlrd +import requests from openpyxl import load_workbook from .. import utils @@ -9,6 +11,7 @@ __authors__ = [ "palewire", + "stucka", ] __tags__ = [ "excel", @@ -18,6 +21,8 @@ "url": "https://kcc.ky.gov/employer/Pages/Business-Downsizing-Assistance---WARN.aspx", } +logger = logging.getLogger(__name__) + def scrape( data_dir: Path = utils.WARN_DATA_DIR, @@ -37,72 +42,78 @@ def scrape( """ # Get the latest workbook cache = Cache(cache_dir) - latest_url = "https://kcc.ky.gov/WARN%20notices/WARN%20NOTICES%202022/WARN%20Notice%20Report%2001262022.xls" - latest_path = cache.download("ky/latest.xls", latest_url) + hostpage = "https://kcc.ky.gov/Pages/News.aspx" + baseurl = "https://kcc.ky.gov" + headers = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0" + } + r = requests.get(hostpage, headers=headers) + html = r.text + subpage = html.split("WARN Notices by Year