
Commit

Merge pull request #588 from biglocalnews/stucka-patch-4
Rebuild KY
stucka authored Nov 15, 2023
2 parents 665f549 + 44d6c2c commit 40de0e9
Showing 1 changed file with 70 additions and 59 deletions.
129 changes: 70 additions & 59 deletions warn/scrapers/ky.py
@@ -1,14 +1,17 @@
import csv
import logging
import typing
from pathlib import Path

import xlrd
import requests
from openpyxl import load_workbook

from .. import utils
from ..cache import Cache

__authors__ = [
"palewire",
"stucka",
]
__tags__ = [
"excel",
@@ -18,6 +21,8 @@
"url": "https://kcc.ky.gov/employer/Pages/Business-Downsizing-Assistance---WARN.aspx",
}

logger = logging.getLogger(__name__)


def scrape(
data_dir: Path = utils.WARN_DATA_DIR,
@@ -37,72 +42,78 @@ def scrape(
"""
# Get the latest workbook
cache = Cache(cache_dir)
latest_url = "https://kcc.ky.gov/WARN%20notices/WARN%20NOTICES%202022/WARN%20Notice%20Report%2001262022.xls"
latest_path = cache.download("ky/latest.xls", latest_url)
hostpage = "https://kcc.ky.gov/Pages/News.aspx"
baseurl = "https://kcc.ky.gov"
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0"
}
r = requests.get(hostpage, headers=headers)
html = r.text
subpage = html.split("WARN Notices by Year</h4")[-1]
# mypy and BeautifulSoup are not cooperating. So ... extract the URL in a dumb way.
fragment = subpage.split('href="')[1].split('"')[0]
latest_url = f"{baseurl}{fragment}"

# latest_url = "https://kcc.ky.gov/WARN%20notices/WARN%20NOTICES%202022/WARN%20Notice%20Report%2001262022.xls"
latest_path = cache.download("ky/latest.xlsx", latest_url)
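
The string-splitting above sidesteps BeautifulSoup because of the mypy friction noted in the comment. Purely as an illustrative alternative (not part of this diff), and assuming bs4 is installed and the page still renders a "WARN Notices by Year" heading followed by year links, the same extraction could be sketched as:

# Illustrative sketch only, not part of ky.py: find the first year link after
# the "WARN Notices by Year" heading. Page structure is assumed, not verified.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, "html.parser")
heading = soup.find(
    lambda tag: tag.name == "h4" and "WARN Notices by Year" in tag.get_text()
)
first_link = heading.find_next("a")["href"]
latest_url = f"{baseurl}{first_link}"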

# Open it up
latest_workbook = xlrd.open_workbook(latest_path)

# Loop through the worksheets
dict_list = []
for sheet in latest_workbook.sheet_names():
# Get the data as a list of lists
worksheet = latest_workbook.sheet_by_name(sheet)
sheet_list = []
num_cols = worksheet.ncols
for row_idx in range(0, worksheet.nrows):
row = []
for col_idx in range(0, num_cols):
cell_obj = worksheet.cell(row_idx, col_idx)
cell_type = worksheet.cell_type(row_idx, col_idx)
# Carve out hack for the 2017 tab, which has the jobs as a date instead of an int
if sheet == "2017" and col_idx == 5:
value = cell_obj.value
# For all the rest of the dates
elif cell_type == xlrd.XL_CELL_DATE:
value = xlrd.xldate.xldate_as_datetime(
cell_obj.value, latest_workbook.datemode
)
# and then the rest of it
else:
value = cell_obj.value
row.append(value)
sheet_list.append(row)

# Convert to list of dicts
headers = sheet_list[0]
for row in sheet_list[1:]:
d = {}
for x, value in enumerate(row):
d[headers[x]] = value
dict_list.append(d)

# Same thing for the archived file
archive_url = "https://kcc.ky.gov/WARN%20notices/2017%20WARN%20docs/Kentucky%20%20WARN%20Report%20-Tracking%20Form%20%281998-2016%29.xlsx"
archive_path = cache.download("ky/archive.xlsx", archive_url)
workbook = load_workbook(filename=latest_path)

# Open it up
archive_workbook = load_workbook(filename=archive_path)
dirty_list: list = []
for sheet in workbook.worksheets:
localrows = parse_xlsx(sheet)
dirty_list.extend(localrows)
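
The parse_xlsx helper is defined further down in ky.py and is collapsed out of this view. As a rough sketch only (the real definition may differ), a helper with that shape could flatten each worksheet into lists of cell values:

# Rough sketch of a parse_xlsx-style helper; the actual implementation lives
# in the collapsed portion of ky.py.
from openpyxl.worksheet.worksheet import Worksheet


def parse_xlsx(worksheet: Worksheet) -> list:
    """Return every row of the worksheet as a list of cell values."""
    rows = []
    for row in worksheet.rows:
        rows.append([cell.value for cell in row])
    return rows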

headers = dirty_list[0]
row_list = []
for rowindex, row in enumerate(dirty_list):
if (
row != headers
): # Filter out headers, but also double-check when headers may change
if row[0] == "Date Received":
logger.debug(
f"Dropping dirty row that doesn't quite match headers in row {rowindex}"
)
logger.debug(f"Want: {headers}")
logger.debug(f"Got : {row}")
else:
line = {}
for i, fieldname in enumerate(headers):
line[fieldname] = row[i]
row_list.append(line)
# dirty_list = None
logger.debug(
f"Successfully merged {len(row_list)-1:,} records from new spreadsheet."
)

# Need to double-check this archived file code, and make sure headers match

# Loop through the worksheets
for sheet in archive_workbook.worksheets:
# Get the data as a list of lists
sheet_list = parse_xlsx(sheet)
archive_url = "https://storage.googleapis.com/bln-data-public/warn-layoffs/ky-historical-normalized.csv"

# Convert to list of dicts
headers = sheet_list[0]
for row in sheet_list[1:]:
d = {}
for i, value in enumerate(row):
d[headers[i]] = value
dict_list.append(d)
logger.debug("Getting KY historical data")
r = requests.get(archive_url)

reader = list(csv.reader(r.text.splitlines()))

headerlength = len(headers)

assert reader[0][:headerlength] == headers
logger.debug(
f"Historical data matches current headers. Merging {len(reader)-1:,} records."
)

for row in reader[1:]: # Skip header row
line = {}
for i, item in enumerate(headers):
line[item] = row[
i
] # Make this a list of dictionaries to match earlier input
row_list.append(line)

# Write out the results
data_path = data_dir / "ky.csv"
all_headers = list({value for dic in dict_list for value in dic.keys()})
utils.write_dict_rows_to_csv(
data_path, all_headers, dict_list, extrasaction="ignore"
)
utils.write_dict_rows_to_csv(data_path, headers, row_list, extrasaction="ignore")

# Pass it out
return data_path
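
Outside this diff, a quick way to exercise the rebuilt scraper (assuming the package is importable as warn with its default data and cache directories) is to call scrape() directly:

# Usage sketch, assuming the warn package and its dependencies are installed.
from warn.scrapers import ky

output_path = ky.scrape()  # returns the path to ky.csv in the default data dir
print(f"Kentucky WARN data written to {output_path}")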
