Update deps, add type stubs and checks, add pre-commit in CI
chriszs committed Mar 23, 2024
1 parent 563beda commit 8958f19
Showing 21 changed files with 6,455 additions and 5,101 deletions.
50 changes: 5 additions & 45 deletions .github/workflows/continuous-deployment.yml
@@ -5,8 +5,8 @@ on:
workflow_dispatch:

jobs:
lint-python:
name: Lint Python code
pre-commit:
name: Lint and format with pre-commit
runs-on: ubuntu-latest
steps:
- id: checkout
@@ -18,50 +18,10 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: '3.9'
cache: 'pipenv'

- id: install-pipenv
name: Install pipenv
run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
shell: bash

- id: install-python-dependencies
name: Install Python dependencies
run: pipenv sync --dev
shell: bash

- id: lint
name: Lint
run: make lint

mypy-python:
name: Static type check
runs-on: ubuntu-latest
steps:
- id: checkout
name: Checkout
uses: actions/checkout@v4

- id: setup-python
name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.9'
cache: 'pipenv'

- id: install-pipenv
name: Install pipenv
run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
shell: bash

- id: install-python-dependencies
name: Install Python dependencies
run: pipenv install --dev --python=`which python`
shell: bash

- id: mypy
name: Mypy
run: make mypy
- id: pre-commit
name: Pre-commit
uses: pre-commit/[email protected]

test-docs:
name: Test Sphinx build
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -45,9 +45,11 @@ repos:
args: [--py37-plus]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v0.991' # Use the sha / tag you want to point at
rev: 'v1.9.0' # Use the sha / tag you want to point at
hooks:
- id: mypy
additional_dependencies:
- types-requests
- types-retry
- types-beautifulsoup4
- types-openpyxl
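
pre-commit runs the mypy hook in its own isolated environment, so the stub packages have to be declared under additional_dependencies here as well as in the Pipfile; without them, mypy falls back to errors along the lines of the sketch below (a hypothetical module, not one from this repo):

    # Hypothetical example: if types-requests is missing from the hook's
    # environment, mypy reports roughly:
    #   error: Library stubs not installed for "requests"
    import requests


    def fetch(url: str) -> str:
        # With the stubs installed, mypy knows get() returns requests.Response
        # and that .text is a str, so this annotation checks cleanly.
        return requests.get(url).text
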
6 changes: 4 additions & 2 deletions Pipfile
@@ -24,9 +24,11 @@ types-requests = "*"
mypy = "*"
typing-extensions = "*"
types-retry = "*"
types-beautifulsoup4 = "*"
types-openpyxl = "*"

[packages]
bs4 = "*"
beautifulsoup4 = "*"
html5lib = "*"
requests = "*"
openpyxl = "*"
@@ -40,4 +42,4 @@ retry = "*"
python_version = "3.9"

[pipenv]
allow_prereleases = true
allow_prereleases = false
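
The dependency rename from bs4 to beautifulsoup4 points the Pipfile at the real distribution rather than the thin bs4 wrapper package, which is also what the types-beautifulsoup4 stubs target; the import path used by the scrapers is unchanged:

    # Either distribution installs the same bs4 module, so imports stay as-is.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>hello</p>", "html.parser")
    print(soup.get_text())
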
1,578 changes: 1,037 additions & 541 deletions tests/cassettes/test_cached_detail_pages.yaml

Large diffs are not rendered by default.

2,111 changes: 1,729 additions & 382 deletions tests/cassettes/test_cached_search_results.yaml

Large diffs are not rendered by default.

2,598 changes: 0 additions & 2,598 deletions tests/cassettes/test_delete.yaml

This file was deleted.

760 changes: 389 additions & 371 deletions tests/cassettes/test_missing_detail_page_values.yaml

Large diffs are not rendered by default.

374 changes: 192 additions & 182 deletions tests/cassettes/test_no_results.yaml

Large diffs are not rendered by default.

2,356 changes: 1,969 additions & 387 deletions tests/cassettes/test_paged_results.yaml

Large diffs are not rendered by default.

1,576 changes: 1,037 additions & 539 deletions tests/cassettes/test_scrape_integration.yaml

Large diffs are not rendered by default.

22 changes: 16 additions & 6 deletions warn/scrapers/co.py
@@ -1,7 +1,7 @@
import logging
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils
from ..cache import Cache
@@ -43,12 +43,18 @@ def scrape(
soup = BeautifulSoup(html, "html5lib")

# Get the link to the Google Sheet that's on the page
current_link = soup.find(class_="region-content").find("a", class_="btn-dark-blue")[
"href"
]
content_region = soup.find(class_="region-content")
if isinstance(content_region, Tag):
current_link = content_region.find("a", class_="btn-dark-blue")
else:
raise ValueError("Could not find content region")
if isinstance(current_link, Tag):
current_href = current_link["href"]
else:
raise ValueError("Could not find Google Sheet link")

# Open the Google Sheet
current_page = utils.get_url(current_link)
current_page = utils.get_url(current_href)
current_html = current_page.text

# Parse the Google Sheet
@@ -57,7 +63,11 @@ def scrape(
cleaned_data = scrape_google_sheets(table)

# Goes through the accordion links to get past data
accordion_list = soup.find(class_="region-content").find_all("dl")
content_region = soup.find(class_="region-content")
if isinstance(content_region, Tag):
accordion_list = content_region.find_all("dl")
else:
raise ValueError("Could not find content region")

# Make sure there's only one
assert len(accordion_list) == 1
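
The isinstance(..., Tag) guards added here, and repeated in the scrapers below, follow from the new bs4 stubs: find() is typed as returning Tag | NavigableString | None, so the result has to be narrowed before calling find_all() or indexing attributes. A minimal sketch of the pattern against hypothetical markup (not this scraper's real page):

    from bs4 import BeautifulSoup, Tag

    html = '<div class="region-content"><a class="btn-dark-blue" href="/sheet">Data</a></div>'
    soup = BeautifulSoup(html, "html.parser")

    # find() may return a Tag, a NavigableString, or None, so narrow it first.
    region = soup.find(class_="region-content")
    if not isinstance(region, Tag):
        raise ValueError("Could not find content region")

    link = region.find("a", class_="btn-dark-blue")
    if not isinstance(link, Tag):
        raise ValueError("Could not find link")

    # After narrowing, mypy allows attribute and item access on the Tag.
    print(link["href"])
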
2 changes: 1 addition & 1 deletion warn/scrapers/ct.py
@@ -2,8 +2,8 @@
from datetime import datetime
from pathlib import Path

from bs4 import BeautifulSoup
import requests
from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
12 changes: 8 additions & 4 deletions warn/scrapers/ga.py
@@ -5,7 +5,7 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils

@@ -134,14 +134,18 @@ def scrape(
# Parse detailed data
masterlist = []
for filename in glob(f"{cache_dir}/ga/*.format3"):
with open(filename, "r", encoding="utf-8") as infile:
with open(filename, encoding="utf-8") as infile:
html = infile.read()
tableholder = BeautifulSoup(html, features="html5lib").find(
"table", {"class": "gv-table-view-content"}
)
lastrowname = "Placeholder"
line = {}
for row in tableholder.find_all("tr")[1:]: # Skip header row
if isinstance(tableholder, Tag):
rows = tableholder.find_all("tr")
else:
raise ValueError("Could not find table")
for row in rows[1:]: # Skip header row
if (
row.find_all("table")
or not row.find_all("th")
@@ -187,7 +191,7 @@ def scrape(
"https://storage.googleapis.com/bln-data-public/warn-layoffs/ga_historical.csv"
)
utils.fetch_if_not_cached(historicalfilename, filehref)
with open(historicalfilename, "r", encoding="utf-8") as infile:
with open(historicalfilename, encoding="utf-8") as infile:
reader = list(csv.DictReader(infile))
logger.debug(f"Found {len(reader):,} historical records.")
for row in reader:
13 changes: 7 additions & 6 deletions warn/scrapers/hi.py
@@ -82,12 +82,13 @@ def scrape(
rows = []
for child in selection:
parent = child.parent
for subitem in parent.prettify().split("<br/>"):
if len(subitem.strip()) > 5 and ".pdf" in subitem:
subitem = subitem.replace("\xa0", " ").replace("\n", "").strip()
row = BeautifulSoup(subitem, features="html5lib")
if row not in rows:
rows.append(row)
if parent is not None:
for subitem in parent.prettify().split("<br/>"):
if len(subitem.strip()) > 5 and ".pdf" in subitem:
subitem = subitem.replace("\xa0", " ").replace("\n", "").strip()
row = BeautifulSoup(subitem, features="html5lib")
if row not in rows:
rows.append(row)

for row in rows:
line: dict = {}
18 changes: 9 additions & 9 deletions warn/scrapers/la.py
@@ -158,7 +158,7 @@ def _is_mostly_empty(row: list) -> bool:
return len(list(filter(pdfplumber.utils.extract_text, row))) <= 2


def _process_pdf(pdf_path):
def _process_pdf(pdf_path) -> list:
"""
Process a PDF file.
@@ -174,27 +174,27 @@ def _process_pdf(pdf_path):
for table in page.debug_tablefinder().tables:
for index, row in enumerate(table.rows):
cells = row.cells
row = [_extract_cell_chars(page, cell) for cell in cells]
cells = [_extract_cell_chars(page, cell) for cell in cells]

# If the first row in a table is mostly empty,
# append its contents to the previous row
if (
_is_first(index)
and _is_mostly_empty(row)
and _is_mostly_empty(cells)
and _has_rows(output_rows)
):
output_rows = _append_contents_to_cells_in_row_above(
output_rows, index, row
output_rows, index, cells
)
# Otherwise, if a row is mostly empty, pull data into blank cells and add current row
elif _is_mostly_empty(row):
row = _append_contents_to_row_from_row_above(
output_rows, index, row
elif _is_mostly_empty(cells):
cells = _append_contents_to_row_from_row_above(
output_rows, index, cells
)
output_rows.append(row)
output_rows.append(cells)
# Otherwise, append the row
else:
output_rows.append(row)
output_rows.append(cells)

return _clean_rows(output_rows)

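
The la.py change renames the rebound variable to cells (and annotates _process_pdf with -> list) instead of reusing the loop variable row for a value of a different type, which mypy rejects. A small sketch of the kind of reassignment it flags, with made-up data rather than pdfplumber rows:

    # Hypothetical sketch of the redefinition problem the rename avoids.
    rows = [(1, 2), (3, 4)]

    for row in rows:
        # Reassigning the loop variable to a list[str] changes its type;
        # mypy reports "Incompatible types in assignment" on a line like:
        # row = [str(value) for value in row]
        cells = [str(value) for value in row]  # a new name keeps both types stable
        print(cells)
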
8 changes: 6 additions & 2 deletions warn/scrapers/mt.py
@@ -1,6 +1,6 @@
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from openpyxl import load_workbook

from .. import utils
@@ -38,7 +38,11 @@ def scrape(

# Parse out the Excel link
soup = BeautifulSoup(html, "html.parser")
links = soup.find(id="boardPage").find_all("a")
board_page = soup.find(id="boardPage")
if isinstance(board_page, Tag):
links = board_page.find_all("a")
else:
raise ValueError("Could not find board page")
excel_name = [
link.attrs["href"]
for link in links
13 changes: 8 additions & 5 deletions warn/scrapers/oh.py
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils

@@ -44,11 +44,14 @@ def scrape(
r = requests.get(latesturl, headers=headers)
soup = BeautifulSoup(r.content)
logger.debug("Attempting to get JSON data from Ohio file")
mydiv = soup.find("div", {"id": "js-placeholder-json-data"})
mydata = json.loads(mydiv.decode_contents().strip())["data"]
rawheaders = mydata[1]
data_div = soup.find("div", {"id": "js-placeholder-json-data"})
if isinstance(data_div, Tag):
data = json.loads(data_div.decode_contents().strip())["data"]
else:
raise ValueError("Could not find JSON data div")
rawheaders = data[1]
masterlist = []
for row in mydata[2:]:
for row in data[2:]:
if len(row) == len(rawheaders):
line = {}
for i, item in enumerate(rawheaders):
20 changes: 14 additions & 6 deletions warn/scrapers/or.py
Expand Up @@ -2,7 +2,7 @@
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from openpyxl import load_workbook

from .. import utils
@@ -45,7 +45,11 @@ def scrape(

# Looking for something like <input name="__RequestVerificationToken" type="hidden" value="GYlfHSHzATg5x9TZgIe...
tokenname = "__RequestVerificationToken"
tokenvalue = soup.find("input", {"name": tokenname})["value"]
tokeninput = soup.find("input", {"name": tokenname})
if isinstance(tokeninput, Tag):
tokenvalue = tokeninput["value"]
else:
raise ValueError("Could not find token input")

payload = {
tokenname: tokenvalue,
@@ -73,7 +77,11 @@ def scrape(
r = requests.post(starturl, cookies=cookies, data=payload, headers=requestheaders)

dlsoup = BeautifulSoup(r.content, features="html5lib")
excelurl = baseurl + dlsoup.find("a", {"class": "btn-primary"})["href"]
excellink = dlsoup.find("a", {"class": "btn-primary"})
if isinstance(excellink, Tag):
excelurl = baseurl + excellink["href"][0]
else:
raise ValueError("Could not find Excel link")
logger.debug(f"Found latest data's URL at {excelurl}")
if not excelurl:
logger.error("No URL could be found for the newest spreadsheet.")
@@ -95,8 +103,8 @@ def scrape(
for i, item in enumerate(headers):
line[item] = list(row)[i].value
if (
len(line[headers[0]]) + len(line[headers[1]])
) != 0: # Filter out blank rows
len(str(line[headers[0]])) + len(str(line[headers[1]])) != 0
): # Filter out blank rows
masterlist.append(line)
historicalurl = (
"https://storage.googleapis.com/bln-data-public/warn-layoffs/or_historical.xlsx"
@@ -125,7 +133,7 @@ def scrape(
for i, item in enumerate(headers):
line[item] = list(row)[i].value
if (
len(line[headers[0]]) + len(line[headers[1]])
len(str(line[headers[0]])) + len(str(line[headers[1]]))
) != 0: # Filter out blank rows
if line in masterlist:
duplicated_rows += 1
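
The blank-row filter in or.py now wraps each cell value in str() because openpyxl cell values are not guaranteed to be strings (they can be None, numbers, or dates), so calling len() on them directly fails type checking and can raise at runtime. A small sketch with made-up cell values:

    # Hypothetical cell values as openpyxl might return them.
    values = ["Acme Corp", None, 42]

    for value in values:
        # len(value) would raise TypeError for None or 42; str() is always safe.
        print(len(str(value)))  # 9, 4, 2
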
2 changes: 1 addition & 1 deletion warn/scrapers/ri.py
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
import typing
from pathlib import Path

from bs4 import BeautifulSoup
from openpyxl import load_workbook
8 changes: 6 additions & 2 deletions warn/scrapers/va.py
@@ -1,7 +1,7 @@
import logging
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from .. import utils
from ..cache import Cache
@@ -40,7 +40,11 @@ def scrape(

# Parse out the CSV download link
soup = BeautifulSoup(html, "html.parser")
csv_href = soup.find("a", text="Download")["href"]
csv_link = soup.find("a", text="Download")
if isinstance(csv_link, Tag):
csv_href = csv_link["href"]
else:
raise ValueError("Could not find CSV link")
csv_url = f"https://www.vec.virginia.gov{csv_href}"

# Download it to the cache
Expand Down