Merge pull request #507 from biglocalnews/wi-header-fix
Fix WI header bug. Fixes #125
palewire authored Dec 14, 2022
2 parents 5401c7a + 6bd9a5f commit 431e864
Showing 6 changed files with 330 additions and 259 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -50,3 +50,4 @@ repos:
- id: mypy
additional_dependencies:
- types-requests
- types-retry
2 changes: 2 additions & 0 deletions Pipfile
@@ -23,6 +23,7 @@ sphinxcontrib-napoleon = "*"
types-requests = "*"
mypy = "*"
typing-extensions = "*"
types-retry = "*"

[packages]
bs4 = "*"
@@ -33,6 +34,7 @@ pdfplumber = "*"
tenacity = "*"
click = "*"
xlrd = "*"
retry = "*"

[requires]
python_version = "3.9"
531 changes: 288 additions & 243 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions setup.py
@@ -135,6 +135,7 @@ def run(self):
"openpyxl",
"xlrd",
"tenacity",
"retry",
],
license="Apache 2.0 license",
zip_safe=False,
45 changes: 29 additions & 16 deletions warn/scrapers/wi.py
@@ -8,7 +8,7 @@
from .. import utils
from ..cache import Cache

__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212"]
__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire"]
__tags__ = ["html"]
__source__ = {
"name": "Wisconsin Department of Workforce Development",
@@ -38,15 +38,24 @@ def scrape(
today = datetime.today()
current_year = today.year

# Get the current year of data
url = "https://dwd.wisconsin.gov/dislocatedworker/warn/"
r = utils.get_url(
url,
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
)
html = r.text
cache.write(f"wi/{current_year}.html", html)
html_list = [html,]

# Set the date range we're going to scrape
year_range = list(range(2016, current_year + 1))
year_range.reverse()

# Loop through the years and download the pages
html_list = []
for year in year_range:
# Since the 2022 page doesn't exist yet, we're going to hack in a skip
if year == 2022:
# Since the current year page doesn't exist, we're going to hack in a skip
if year == current_year:
continue

# Request fresh pages, use cache for old ones
@@ -62,8 +71,19 @@
# Add to the list
html_list.append(html)

output_rows = []
for ihtml, html in enumerate(html_list):
header = [
"Company",
"City",
"Affected Workers",
"Notice Received",
"Original Notice Type / Update Type",
"Layoff Begin Date",
"NAICS Description",
"County"
"Workforce Development Area"
]
output_rows = [header,]
for html in html_list:
# Parse the HTML
soup = BeautifulSoup(html, "html5lib")

@@ -75,18 +95,11 @@
notice_tables = [t for t in table_list if len(t.find("tr").find_all("th")) > 2]

# Loop through the tables
for itable, table in enumerate(notice_tables):
for table in notice_tables:
# Get all the rows
row_list = table.find_all("tr")

# If this is the first table on the first page, get headers too
tags = ["td"]
if ihtml == 0 and itable == 0:
tags.append("th")

for row in row_list:
for row in table.find_all("tr"):
# Pull out the cells and clean them
cell_list = [_clean_text(c.text) for c in row.find_all(tags)]
cell_list = [_clean_text(c.text) for c in row.find_all(["td"])]

# Skip empty rows
try:
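Editor's note on the wi.py change: the rewrite fetches the current year's notices directly from the department's main WARN page, loops backward through the prior-year archive pages, and then parses every page against a single hand-coded header row, reading only td cells. That removes the old dependence on scraping th headers out of the first table on the first page, which is what broke. A minimal sketch of the new parsing flow, with simplified names that are illustrative rather than the scraper's exact code:

from bs4 import BeautifulSoup

HEADER = [
    "Company",
    "City",
    "Affected Workers",
    "Notice Received",
    "Original Notice Type / Update Type",
    "Layoff Begin Date",
    "NAICS Description",
    "County",
    "Workforce Development Area",
]

def parse_notice_tables(html_list):
    """Build output rows with one hard-coded header, reading only <td> cells."""
    rows = [HEADER]
    for html in html_list:
        soup = BeautifulSoup(html, "html5lib")
        for table in soup.find_all("table"):
            first_row = table.find("tr")
            # Keep only tables whose first row has more than two <th> cells,
            # mirroring the notice-table filter in the diff above
            if first_row is None or len(first_row.find_all("th")) <= 2:
                continue
            for tr in table.find_all("tr"):
                cells = [c.text.strip() for c in tr.find_all("td")]
                if any(cells):  # skip header rows and empty rows
                    rows.append(cells)
    return rows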
9 changes: 9 additions & 0 deletions warn/utils.py
@@ -5,6 +5,7 @@
from pathlib import Path

import requests
from retry import retry
from openpyxl import load_workbook

logger = logging.getLogger(__name__)
@@ -98,6 +99,7 @@ def get_all_scrapers():
)


@retry(tries=3, delay=15, backoff=2)
def get_url(
url, user_agent="Big Local News (biglocalnews.org)", session=None, **kwargs
):
@@ -110,16 +112,23 @@
"""
logger.debug(f"Requesting {url}")

# Set the headers
if "headers" not in kwargs:
kwargs["headers"] = {}
kwargs["headers"]["User-Agent"] = user_agent

# Go get it
if session is not None:
logger.debug(f"Requesting with session {session}")
response = session.get(url, **kwargs)
else:
response = requests.get(url, **kwargs)
logger.debug(f"Response code: {response.status_code}")

# Verify that the response is 200
assert response.ok

# Return the response
return response


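Editor's note on the utils.py change: the new @retry(tries=3, delay=15, backoff=2) decorator from the retry package re-runs get_url whenever it raises, making up to three attempts and waiting 15 seconds before the second try and 30 before the third. Because the function now asserts response.ok, a non-2xx status raises AssertionError and triggers a retry instead of returning a bad response. A rough sketch of the same pattern on a standalone function, with illustrative values:

import requests
from retry import retry

@retry(tries=3, delay=15, backoff=2)
def fetch(url: str) -> requests.Response:
    # Any exception raised here -- including the AssertionError from a
    # non-2xx status -- causes another attempt, up to three in total.
    response = requests.get(url, timeout=30)
    assert response.ok
    return response

If all three attempts fail, the last exception propagates to the caller.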
