Update deps, add type stubs and checks, add pre-commit in CI
chriszs committed Mar 23, 2024
1 parent 563beda commit 8958f19
Showing 21 changed files with 6,455 additions and 5,101 deletions.
50 changes: 5 additions & 45 deletions .github/workflows/continuous-deployment.yml
@@ -5,8 +5,8 @@ on:
workflow_dispatch:

jobs:
lint-python:
name: Lint Python code
pre-commit:
name: Lint and format with pre-commit
runs-on: ubuntu-latest
steps:
- id: checkout
@@ -18,50 +18,10 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: '3.9'
cache: 'pipenv'

- id: install-pipenv
name: Install pipenv
run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
shell: bash

- id: install-python-dependencies
name: Install Python dependencies
run: pipenv sync --dev
shell: bash

- id: lint
name: Lint
run: make lint

mypy-python:
name: Static type check
runs-on: ubuntu-latest
steps:
- id: checkout
name: Checkout
uses: actions/checkout@v4

- id: setup-python
name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.9'
cache: 'pipenv'

- id: install-pipenv
name: Install pipenv
run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
shell: bash

- id: install-python-dependencies
name: Install Python dependencies
run: pipenv install --dev --python=`which python`
shell: bash

- id: mypy
name: Mypy
run: make mypy
- id: pre-commit
name: Pre-commit
uses: pre-commit/[email protected]

test-docs:
name: Test Sphinx build
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -45,9 +45,11 @@ repos:
args: [--py37-plus]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v0.991' # Use the sha / tag you want to point at
rev: 'v1.9.0' # Use the sha / tag you want to point at
hooks:
- id: mypy
additional_dependencies:
- types-requests
- types-retry
- types-beautifulsoup4
- types-openpyxl
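
pre-commit runs the mypy hook in its own isolated environment, so the stub packages have to be declared under additional_dependencies here as well as in the Pipfile; without them, mypy falls back to errors along the lines of the sketch below (a hypothetical module, not one from this repo):

    # Hypothetical example: if types-requests is missing from the hook's
    # environment, mypy reports roughly:
    #   error: Library stubs not installed for "requests"
    import requests


    def fetch(url: str) -> str:
        # With the stubs installed, mypy knows get() returns requests.Response
        # and that .text is a str, so this annotation checks cleanly.
        return requests.get(url).text
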
6 changes: 4 additions & 2 deletions Pipfile
@@ -24,9 +24,11 @@ types-requests = "*"
mypy = "*"
typing-extensions = "*"
types-retry = "*"
types-beautifulsoup4 = "*"
types-openpyxl = "*"

[packages]
bs4 = "*"
beautifulsoup4 = "*"
html5lib = "*"
requests = "*"
openpyxl = "*"
@@ -40,4 +42,4 @@ retry = "*"
python_version = "3.9"

[pipenv]
allow_prereleases = true
allow_prereleases = false
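
The dependency rename from bs4 to beautifulsoup4 points the Pipfile at the real distribution rather than the thin bs4 wrapper package, which is also what the types-beautifulsoup4 stubs target; the import path used by the scrapers is unchanged:

    # Either distribution installs the same bs4 module, so imports stay as-is.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>hello</p>", "html.parser")
    print(soup.get_text())
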
1,578 changes: 1,037 additions & 541 deletions tests/cassettes/test_cached_detail_pages.yaml

Large diffs are not rendered by default.

2,111 changes: 1,729 additions & 382 deletions tests/cassettes/test_cached_search_results.yaml

Large diffs are not rendered by default.

2,598 changes: 0 additions & 2,598 deletions tests/cassettes/test_delete.yaml

This file was deleted.

760 changes: 389 additions & 371 deletions tests/cassettes/test_missing_detail_page_values.yaml

Large diffs are not rendered by default.

374 changes: 192 additions & 182 deletions tests/cassettes/test_no_results.yaml

Large diffs are not rendered by default.

2,356 changes: 1,969 additions & 387 deletions tests/cassettes/test_paged_results.yaml

Large diffs are not rendered by default.

1,576 changes: 1,037 additions & 539 deletions tests/cassettes/test_scrape_integration.yaml

Large diffs are not rendered by default.

22 changes: 16 additions & 6 deletions warn/scrapers/co.py
@@ -1,7 +1,7 @@
import logging
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils
from ..cache import Cache
@@ -43,12 +43,18 @@ def scrape(
soup = BeautifulSoup(html, "html5lib")

# Get the link to the Google Sheet that's on the page
current_link = soup.find(class_="region-content").find("a", class_="btn-dark-blue")[
"href"
]
content_region = soup.find(class_="region-content")
if isinstance(content_region, Tag):
current_link = content_region.find("a", class_="btn-dark-blue")
else:
raise ValueError("Could not find content region")
if isinstance(current_link, Tag):
current_href = current_link["href"]
else:
raise ValueError("Could not find Google Sheet link")

# Open the Google Sheet
current_page = utils.get_url(current_link)
current_page = utils.get_url(current_href)
current_html = current_page.text

# Parse the Google Sheet
@@ -57,7 +63,11 @@ def scrape(
cleaned_data = scrape_google_sheets(table)

# Goes through the accordion links to get past data
accordion_list = soup.find(class_="region-content").find_all("dl")
content_region = soup.find(class_="region-content")
if isinstance(content_region, Tag):
accordion_list = content_region.find_all("dl")
else:
raise ValueError("Could not find content region")

# Make sure there's only one
assert len(accordion_list) == 1
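
The isinstance(..., Tag) guards added here, and repeated in the scrapers below, follow from the new bs4 stubs: find() is typed as returning Tag | NavigableString | None, so the result has to be narrowed before calling find_all() or indexing attributes. A minimal sketch of the pattern against hypothetical markup (not this scraper's real page):

    from bs4 import BeautifulSoup, Tag

    html = '<div class="region-content"><a class="btn-dark-blue" href="/sheet">Data</a></div>'
    soup = BeautifulSoup(html, "html.parser")

    # find() may return a Tag, a NavigableString, or None, so narrow it first.
    region = soup.find(class_="region-content")
    if not isinstance(region, Tag):
        raise ValueError("Could not find content region")

    link = region.find("a", class_="btn-dark-blue")
    if not isinstance(link, Tag):
        raise ValueError("Could not find link")

    # After narrowing, mypy allows attribute and item access on the Tag.
    print(link["href"])
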
2 changes: 1 addition & 1 deletion warn/scrapers/ct.py
@@ -2,8 +2,8 @@
from datetime import datetime
from pathlib import Path

from bs4 import BeautifulSoup
import requests
from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
12 changes: 8 additions & 4 deletions warn/scrapers/ga.py
@@ -5,7 +5,7 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils

@@ -134,14 +134,18 @@ def scrape(
# Parse detailed data
masterlist = []
for filename in glob(f"{cache_dir}/ga/*.format3"):
with open(filename, "r", encoding="utf-8") as infile:
with open(filename, encoding="utf-8") as infile:
html = infile.read()
tableholder = BeautifulSoup(html, features="html5lib").find(
"table", {"class": "gv-table-view-content"}
)
lastrowname = "Placeholder"
line = {}
for row in tableholder.find_all("tr")[1:]: # Skip header row
if isinstance(tableholder, Tag):
rows = tableholder.find_all("tr")
else:
raise ValueError("Could not find table")
for row in rows[1:]: # Skip header row
if (
row.find_all("table")
or not row.find_all("th")
@@ -187,7 +191,7 @@ def scrape(
"https://storage.googleapis.com/bln-data-public/warn-layoffs/ga_historical.csv"
)
utils.fetch_if_not_cached(historicalfilename, filehref)
with open(historicalfilename, "r", encoding="utf-8") as infile:
with open(historicalfilename, encoding="utf-8") as infile:
reader = list(csv.DictReader(infile))
logger.debug(f"Found {len(reader):,} historical records.")
for row in reader:
13 changes: 7 additions & 6 deletions warn/scrapers/hi.py
@@ -82,12 +82,13 @@ def scrape(
rows = []
for child in selection:
parent = child.parent
for subitem in parent.prettify().split("<br/>"):
if len(subitem.strip()) > 5 and ".pdf" in subitem:
subitem = subitem.replace("\xa0", " ").replace("\n", "").strip()
row = BeautifulSoup(subitem, features="html5lib")
if row not in rows:
rows.append(row)
if parent is not None:
for subitem in parent.prettify().split("<br/>"):
if len(subitem.strip()) > 5 and ".pdf" in subitem:
subitem = subitem.replace("\xa0", " ").replace("\n", "").strip()
row = BeautifulSoup(subitem, features="html5lib")
if row not in rows:
rows.append(row)

for row in rows:
line: dict = {}
18 changes: 9 additions & 9 deletions warn/scrapers/la.py
@@ -158,7 +158,7 @@ def _is_mostly_empty(row: list) -> bool:
return len(list(filter(pdfplumber.utils.extract_text, row))) <= 2


def _process_pdf(pdf_path):
def _process_pdf(pdf_path) -> list:
"""
Process a PDF file.
@@ -174,27 +174,27 @@ def _process_pdf(pdf_path):
for table in page.debug_tablefinder().tables:
for index, row in enumerate(table.rows):
cells = row.cells
row = [_extract_cell_chars(page, cell) for cell in cells]
cells = [_extract_cell_chars(page, cell) for cell in cells]

# If the first row in a table is mostly empty,
# append its contents to the previous row
if (
_is_first(index)
and _is_mostly_empty(row)
and _is_mostly_empty(cells)
and _has_rows(output_rows)
):
output_rows = _append_contents_to_cells_in_row_above(
output_rows, index, row
output_rows, index, cells
)
# Otherwise, if a row is mostly empty, pull data into blank cells and add current row
elif _is_mostly_empty(row):
row = _append_contents_to_row_from_row_above(
output_rows, index, row
elif _is_mostly_empty(cells):
cells = _append_contents_to_row_from_row_above(
output_rows, index, cells
)
output_rows.append(row)
output_rows.append(cells)
# Otherwise, append the row
else:
output_rows.append(row)
output_rows.append(cells)

return _clean_rows(output_rows)

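
The la.py change renames the rebound variable to cells (and annotates _process_pdf with -> list) instead of reusing the loop variable row for a value of a different type, which mypy rejects. A small sketch of the kind of reassignment it flags, with made-up data rather than pdfplumber rows:

    # Hypothetical sketch of the redefinition problem the rename avoids.
    rows = [(1, 2), (3, 4)]

    for row in rows:
        # Reassigning the loop variable to a list[str] changes its type;
        # mypy reports "Incompatible types in assignment" on a line like:
        # row = [str(value) for value in row]
        cells = [str(value) for value in row]  # a new name keeps both types stable
        print(cells)
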
8 changes: 6 additions & 2 deletions warn/scrapers/mt.py
@@ -1,6 +1,6 @@
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from openpyxl import load_workbook

from .. import utils
@@ -38,7 +38,11 @@ def scrape(

# Parse out the Excel link
soup = BeautifulSoup(html, "html.parser")
links = soup.find(id="boardPage").find_all("a")
board_page = soup.find(id="boardPage")
if isinstance(board_page, Tag):
links = board_page.find_all("a")
else:
raise ValueError("Could not find board page")
excel_name = [
link.attrs["href"]
for link in links
13 changes: 8 additions & 5 deletions warn/scrapers/oh.py
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils

@@ -44,11 +44,14 @@ def scrape(
r = requests.get(latesturl, headers=headers)
soup = BeautifulSoup(r.content)
logger.debug("Attempting to get JSON data from Ohio file")
mydiv = soup.find("div", {"id": "js-placeholder-json-data"})
mydata = json.loads(mydiv.decode_contents().strip())["data"]
rawheaders = mydata[1]
data_div = soup.find("div", {"id": "js-placeholder-json-data"})
if isinstance(data_div, Tag):
data = json.loads(data_div.decode_contents().strip())["data"]
else:
raise ValueError("Could not find JSON data div")
rawheaders = data[1]
masterlist = []
for row in mydata[2:]:
for row in data[2:]:
if len(row) == len(rawheaders):
line = {}
for i, item in enumerate(rawheaders):
20 changes: 14 additions & 6 deletions warn/scrapers/or.py
Expand Up @@ -2,7 +2,7 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from openpyxl import load_workbook

from .. import utils
@@ -45,7 +45,11 @@ def scrape(

# Looking for something like <input name="__RequestVerificationToken" type="hidden" value="GYlfHSHzATg5x9TZgIe...
tokenname = "__RequestVerificationToken"
tokenvalue = soup.find("input", {"name": tokenname})["value"]
tokeninput = soup.find("input", {"name": tokenname})
if isinstance(tokeninput, Tag):
tokenvalue = tokeninput["value"]
else:
raise ValueError("Could not find token input")

payload = {
tokenname: tokenvalue,
@@ -73,7 +77,11 @@ def scrape(
r = requests.post(starturl, cookies=cookies, data=payload, headers=requestheaders)

dlsoup = BeautifulSoup(r.content, features="html5lib")
excelurl = baseurl + dlsoup.find("a", {"class": "btn-primary"})["href"]
excellink = dlsoup.find("a", {"class": "btn-primary"})
if isinstance(excellink, Tag):
excelurl = baseurl + excellink["href"][0]
else:
raise ValueError("Could not find Excel link")
logger.debug(f"Found latest data's URL at {excelurl}")
if not excelurl:
logger.error("No URL could be found for the newest spreadsheet.")
@@ -95,8 +103,8 @@ def scrape(
for i, item in enumerate(headers):
line[item] = list(row)[i].value
if (
len(line[headers[0]]) + len(line[headers[1]])
) != 0: # Filter out blank rows
len(str(line[headers[0]])) + len(str(line[headers[1]])) != 0
): # Filter out blank rows
masterlist.append(line)
historicalurl = (
"https://storage.googleapis.com/bln-data-public/warn-layoffs/or_historical.xlsx"
@@ -125,7 +133,7 @@ def scrape(
for i, item in enumerate(headers):
line[item] = list(row)[i].value
if (
len(line[headers[0]]) + len(line[headers[1]])
len(str(line[headers[0]])) + len(str(line[headers[1]]))
) != 0: # Filter out blank rows
if line in masterlist:
duplicated_rows += 1
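
The blank-row filter in or.py now wraps each cell value in str() because openpyxl cell values are not guaranteed to be strings (they can be None, numbers, or dates), so calling len() on them directly fails type checking and can raise at runtime. A small sketch with made-up cell values:

    # Hypothetical cell values as openpyxl might return them.
    values = ["Acme Corp", None, 42]

    for value in values:
        # len(value) would raise TypeError for None or 42; str() is always safe.
        print(len(str(value)))  # 9, 4, 2
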
2 changes: 1 addition & 1 deletion warn/scrapers/ri.py
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
import typing
from pathlib import Path

from bs4 import BeautifulSoup
from openpyxl import load_workbook
8 changes: 6 additions & 2 deletions warn/scrapers/va.py
@@ -1,7 +1,7 @@
import logging
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils
from ..cache import Cache
@@ -40,7 +40,11 @@ def scrape(

# Parse out the CSV download link
soup = BeautifulSoup(html, "html.parser")
csv_href = soup.find("a", text="Download")["href"]
csv_link = soup.find("a", text="Download")
if isinstance(csv_link, Tag):
csv_href = csv_link["href"]
else:
raise ValueError("Could not find CSV link")
csv_url = f"https://www.vec.virginia.gov{csv_href}"

# Download it to the cache
Expand Down