Check pre-commit in GitHub Actions #626

Merged 3 commits on Mar 25, 2024
50 changes: 5 additions & 45 deletions .github/workflows/continuous-deployment.yml
@@ -5,8 +5,8 @@ on:
workflow_dispatch:

jobs:
lint-python:
name: Lint Python code
pre-commit:
name: Lint and format with pre-commit
runs-on: ubuntu-latest
steps:
- id: checkout
@@ -18,50 +18,10 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: '3.9'
cache: 'pipenv'

- id: install-pipenv
name: Install pipenv
run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
shell: bash

- id: install-python-dependencies
name: Install Python dependencies
run: pipenv sync --dev
shell: bash

- id: lint
name: Lint
run: make lint

mypy-python:
name: Static type check
runs-on: ubuntu-latest
steps:
- id: checkout
name: Checkout
uses: actions/checkout@v4

- id: setup-python
name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.9'
cache: 'pipenv'

- id: install-pipenv
name: Install pipenv
run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
shell: bash

- id: install-python-dependencies
name: Install Python dependencies
run: pipenv install --dev --python=`which python`
shell: bash

- id: mypy
name: Mypy
run: make mypy
- id: pre-commit
name: Pre-commit
uses: pre-commit/[email protected]

test-docs:
name: Test Sphinx build
16 changes: 9 additions & 7 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
@@ -15,39 +15,41 @@ repos:
- id: mixed-line-ending

- repo: https://github.com/psf/black
rev: 22.3.0
rev: 24.3.0
hooks:
- id: black

- repo: https://github.com/asottile/blacken-docs
rev: v1.12.1
rev: 1.16.0
hooks:
- id: blacken-docs
additional_dependencies: [black]

- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort

- repo: https://github.com/PyCQA/flake8
rev: 6.1.0
rev: 7.0.0
hooks:
- id: flake8
additional_dependencies:
- flake8-docstrings
- flake8-bugbear

- repo: https://github.com/asottile/pyupgrade
rev: v2.31.0
rev: v3.15.2
hooks:
- id: pyupgrade
args: [--py37-plus]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v0.991' # Use the sha / tag you want to point at
rev: 'v1.9.0' # Use the sha / tag you want to point at
hooks:
- id: mypy
additional_dependencies:
- types-requests
- types-retry
- types-beautifulsoup4
- types-openpyxl
13 changes: 11 additions & 2 deletions Pipfile
@@ -24,9 +24,17 @@ types-requests = "*"
mypy = "*"
typing-extensions = "*"
types-retry = "*"
types-beautifulsoup4 = "*"
types-openpyxl = "*"
# pinned last known versions to support Sphinx 4
sphinxcontrib-applehelp = "1.0.4"
sphinxcontrib-devhelp = "1.0.2"
sphinxcontrib-htmlhelp = "2.0.1"
sphinxcontrib-qthelp = "1.0.3"
sphinxcontrib-serializinghtml = "1.1.5"

[packages]
bs4 = "*"
beautifulsoup4 = "*"
html5lib = "*"
requests = "*"
openpyxl = "*"
@@ -35,9 +43,10 @@ tenacity = "*"
click = "*"
xlrd = "*"
retry = "*"
urllib3 = "1.26.18" # pegged to avoid test issue

[requires]
python_version = "3.9"

[pipenv]
allow_prereleases = true
allow_prereleases = false
1,975 changes: 1,094 additions & 881 deletions Pipfile.lock

Large diffs are not rendered by default.

863 changes: 325 additions & 538 deletions tests/cassettes/test_cached_detail_pages.yaml

Large diffs are not rendered by default.

619 changes: 239 additions & 380 deletions tests/cassettes/test_cached_search_results.yaml

Large diffs are not rendered by default.

2,598 changes: 0 additions & 2,598 deletions tests/cassettes/test_delete.yaml

This file was deleted.

603 changes: 234 additions & 369 deletions tests/cassettes/test_missing_detail_page_values.yaml

Large diffs are not rendered by default.

295 changes: 114 additions & 181 deletions tests/cassettes/test_no_results.yaml

Large diffs are not rendered by default.

630 changes: 245 additions & 385 deletions tests/cassettes/test_paged_results.yaml

Large diffs are not rendered by default.

861 changes: 325 additions & 536 deletions tests/cassettes/test_scrape_integration.yaml

Large diffs are not rendered by default.

22 changes: 16 additions & 6 deletions warn/scrapers/co.py
@@ -1,7 +1,7 @@
import logging
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils
from ..cache import Cache
@@ -43,12 +43,18 @@ def scrape(
soup = BeautifulSoup(html, "html5lib")

# Get the link to the Google Sheet that's on the page
current_link = soup.find(class_="region-content").find("a", class_="btn-dark-blue")[
"href"
]
content_region = soup.find(class_="region-content")
if isinstance(content_region, Tag):
current_link = content_region.find("a", class_="btn-dark-blue")
else:
raise ValueError("Could not find content region")
if isinstance(current_link, Tag):
current_href = current_link["href"]
else:
raise ValueError("Could not find Google Sheet link")

# Open the Google Sheet
current_page = utils.get_url(current_link)
current_page = utils.get_url(current_href)
current_html = current_page.text

# Parse the Google Sheet
@@ -57,7 +63,11 @@
cleaned_data = scrape_google_sheets(table)

# Goes through the accordion links to get past data
accordion_list = soup.find(class_="region-content").find_all("dl")
content_region = soup.find(class_="region-content")
if isinstance(content_region, Tag):
accordion_list = content_region.find_all("dl")
else:
raise ValueError("Could not find content region")

# Make sure there's only one
assert len(accordion_list) == 1
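The co.py change above (and the similar ones in ga.py, mt.py, and oh.py further down) all follow the same shape: with the types-beautifulsoup4 stubs installed, mypy types `soup.find(...)` as possibly `None` or a `NavigableString`, so the result is narrowed with `isinstance(..., Tag)` before chaining `.find()`, `.find_all()`, or subscripting `["href"]`. Below is a minimal, self-contained sketch of that pattern; the HTML and URL are invented for illustration, not taken from the real scrapers.

```python
# Minimal sketch of the mypy-friendly Tag-narrowing pattern used in this PR.
from bs4 import BeautifulSoup, Tag

html = '<div class="region-content"><a class="btn-dark-blue" href="https://example.com/sheet">Data</a></div>'
soup = BeautifulSoup(html, "html.parser")

# find() may return Tag, NavigableString, or None under types-beautifulsoup4,
# so narrow before calling Tag-only methods or subscripting.
content_region = soup.find(class_="region-content")
if not isinstance(content_region, Tag):
    raise ValueError("Could not find content region")

link = content_region.find("a", class_="btn-dark-blue")
if not isinstance(link, Tag):
    raise ValueError("Could not find link")

href = link["href"]  # safe: Tag supports item access
print(href)
```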
2 changes: 1 addition & 1 deletion warn/scrapers/ct.py
@@ -2,8 +2,8 @@
from datetime import datetime
from pathlib import Path

from bs4 import BeautifulSoup
import requests
from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
6 changes: 3 additions & 3 deletions warn/scrapers/dc.py
@@ -71,9 +71,9 @@ def scrape(
# from 2017 instead.
if link_lookup.get("2014") == link_lookup.get("2018"):
logger.warning("2014 link is the same as 2018 link, using archived 2014")
link_lookup[
"2014"
] = "https://web.archive.org/web/20170210010137/http://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-closure%202014"
link_lookup["2014"] = (
"https://web.archive.org/web/20170210010137/http://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-closure%202014"
)

# Download them all
html_list = [
12 changes: 8 additions & 4 deletions warn/scrapers/ga.py
@@ -5,7 +5,7 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils

@@ -134,14 +134,18 @@ def scrape(
# Parse detailed data
masterlist = []
for filename in glob(f"{cache_dir}/ga/*.format3"):
with open(filename, "r", encoding="utf-8") as infile:
with open(filename, encoding="utf-8") as infile:
html = infile.read()
tableholder = BeautifulSoup(html, features="html5lib").find(
"table", {"class": "gv-table-view-content"}
)
lastrowname = "Placeholder"
line = {}
for row in tableholder.find_all("tr")[1:]: # Skip header row
if isinstance(tableholder, Tag):
rows = tableholder.find_all("tr")
else:
raise ValueError("Could not find table")
for row in rows[1:]: # Skip header row
if (
row.find_all("table")
or not row.find_all("th")
@@ -187,7 +191,7 @@ def scrape(
"https://storage.googleapis.com/bln-data-public/warn-layoffs/ga_historical.csv"
)
utils.fetch_if_not_cached(historicalfilename, filehref)
with open(historicalfilename, "r", encoding="utf-8") as infile:
with open(historicalfilename, encoding="utf-8") as infile:
reader = list(csv.DictReader(infile))
logger.debug(f"Found {len(reader):,} historical records.")
for row in reader:
13 changes: 7 additions & 6 deletions warn/scrapers/hi.py
@@ -82,12 +82,13 @@ def scrape(
rows = []
for child in selection:
parent = child.parent
for subitem in parent.prettify().split("<br/>"):
if len(subitem.strip()) > 5 and ".pdf" in subitem:
subitem = subitem.replace("\xa0", " ").replace("\n", "").strip()
row = BeautifulSoup(subitem, features="html5lib")
if row not in rows:
rows.append(row)
if parent is not None:
for subitem in parent.prettify().split("<br/>"):
if len(subitem.strip()) > 5 and ".pdf" in subitem:
subitem = subitem.replace("\xa0", " ").replace("\n", "").strip()
row = BeautifulSoup(subitem, features="html5lib")
if row not in rows:
rows.append(row)

for row in rows:
line: dict = {}
18 changes: 9 additions & 9 deletions warn/scrapers/la.py
@@ -158,7 +158,7 @@ def _is_mostly_empty(row: list) -> bool:
return len(list(filter(pdfplumber.utils.extract_text, row))) <= 2


def _process_pdf(pdf_path):
def _process_pdf(pdf_path) -> list:
"""
Process a PDF file.

@@ -174,27 +174,27 @@ def _process_pdf(pdf_path):
for table in page.debug_tablefinder().tables:
for index, row in enumerate(table.rows):
cells = row.cells
row = [_extract_cell_chars(page, cell) for cell in cells]
cells = [_extract_cell_chars(page, cell) for cell in cells]

# If the first row in a table is mostly empty,
# append its contents to the previous row
if (
_is_first(index)
and _is_mostly_empty(row)
and _is_mostly_empty(cells)
and _has_rows(output_rows)
):
output_rows = _append_contents_to_cells_in_row_above(
output_rows, index, row
output_rows, index, cells
)
# Otherwise, if a row is mostly empty, pull data into blank cells and add current row
elif _is_mostly_empty(row):
row = _append_contents_to_row_from_row_above(
output_rows, index, row
elif _is_mostly_empty(cells):
cells = _append_contents_to_row_from_row_above(
output_rows, index, cells
)
output_rows.append(row)
output_rows.append(cells)
# Otherwise, append the row
else:
output_rows.append(row)
output_rows.append(cells)

return _clean_rows(output_rows)

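The la.py edit is mostly a rename: the loop previously reused `row` for both pdfplumber's row object and the list of extracted cell text, which leaves the variable's type ambiguous for mypy and readers alike; `cells` keeps the two apart. The merge-up behavior the comments describe can be sketched in isolation as below. This is a simplified, hypothetical example with made-up data, not the module's actual helpers.

```python
# Hypothetical illustration of merging a mostly empty PDF table row into the
# row above it, as the comments in warn/scrapers/la.py describe.
def is_mostly_empty(cells: list[str]) -> bool:
    """Treat a row as mostly empty if two or fewer cells hold text."""
    return len([c for c in cells if c.strip()]) <= 2

def merge_up(output_rows: list[list[str]], cells: list[str]) -> None:
    """Append a mostly empty row's contents onto the last emitted row."""
    last = output_rows[-1]
    for i, text in enumerate(cells):
        if text.strip():
            last[i] = f"{last[i]} {text}".strip()

rows = [
    ["Acme Corp", "2024-01-02", "120"],
    ["", "(amended)", ""],  # continuation row spilled by the PDF layout
    ["Widget LLC", "2024-02-10", "45"],
]

output_rows: list[list[str]] = []
for cells in rows:
    if is_mostly_empty(cells) and output_rows:
        merge_up(output_rows, cells)
    else:
        output_rows.append(cells)

print(output_rows)
# [['Acme Corp', '2024-01-02 (amended)', '120'], ['Widget LLC', '2024-02-10', '45']]
```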
8 changes: 6 additions & 2 deletions warn/scrapers/mt.py
@@ -1,6 +1,6 @@
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from openpyxl import load_workbook

from .. import utils
@@ -38,7 +38,11 @@ def scrape(

# Parse out the Excel link
soup = BeautifulSoup(html, "html.parser")
links = soup.find(id="boardPage").find_all("a")
board_page = soup.find(id="boardPage")
if isinstance(board_page, Tag):
links = board_page.find_all("a")
else:
raise ValueError("Could not find board page")
excel_name = [
link.attrs["href"]
for link in links
13 changes: 8 additions & 5 deletions warn/scrapers/oh.py
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils

@@ -44,11 +44,14 @@ def scrape(
r = requests.get(latesturl, headers=headers)
soup = BeautifulSoup(r.content)
logger.debug("Attempting to get JSON data from Ohio file")
mydiv = soup.find("div", {"id": "js-placeholder-json-data"})
mydata = json.loads(mydiv.decode_contents().strip())["data"]
rawheaders = mydata[1]
data_div = soup.find("div", {"id": "js-placeholder-json-data"})
if isinstance(data_div, Tag):
data = json.loads(data_div.decode_contents().strip())["data"]
else:
raise ValueError("Could not find JSON data div")
rawheaders = data[1]
masterlist = []
for row in mydata[2:]:
for row in data[2:]:
if len(row) == len(rawheaders):
line = {}
for i, item in enumerate(rawheaders):
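The oh.py change narrows the placeholder `<div>` to a `Tag` before calling `decode_contents()`, and renames `mydiv`/`mydata` to clearer names. A rough sketch of the same extract-JSON-from-a-div flow is shown below; the HTML and payload are invented stand-ins for Ohio's real page.

```python
# Minimal sketch of pulling JSON out of an embedded <div> and zipping the
# header row against the data rows, mirroring the structure above.
import json
from bs4 import BeautifulSoup, Tag

html = """
<div id="js-placeholder-json-data">
{"data": [["meta"], ["Company", "Notice Date"], ["Acme Corp", "2024-01-02"]]}
</div>
"""
soup = BeautifulSoup(html, "html.parser")

data_div = soup.find("div", {"id": "js-placeholder-json-data"})
if not isinstance(data_div, Tag):
    raise ValueError("Could not find JSON data div")

data = json.loads(data_div.decode_contents().strip())["data"]
rawheaders = data[1]  # second entry carries the column names
records = []
for row in data[2:]:  # remaining entries are data rows
    if len(row) == len(rawheaders):
        records.append(dict(zip(rawheaders, row)))

print(records)  # [{'Company': 'Acme Corp', 'Notice Date': '2024-01-02'}]
```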