Merge pull request #507 from biglocalnews/wi-header-fix
Fix WI header bug. Fixes #125
palewire authored Dec 14, 2022
2 parents 5401c7a + 6bd9a5f commit 431e864
Showing 6 changed files with 330 additions and 259 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -50,3 +50,4 @@ repos:
- id: mypy
additional_dependencies:
- types-requests
- types-retry
2 changes: 2 additions & 0 deletions Pipfile
@@ -23,6 +23,7 @@ sphinxcontrib-napoleon = "*"
types-requests = "*"
mypy = "*"
typing-extensions = "*"
types-retry = "*"

[packages]
bs4 = "*"
@@ -33,6 +34,7 @@ pdfplumber = "*"
tenacity = "*"
click = "*"
xlrd = "*"
retry = "*"

[requires]
python_version = "3.9"
531 changes: 288 additions & 243 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions setup.py
@@ -135,6 +135,7 @@ def run(self):
"openpyxl",
"xlrd",
"tenacity",
"retry",
],
license="Apache 2.0 license",
zip_safe=False,
45 changes: 29 additions & 16 deletions warn/scrapers/wi.py
@@ -8,7 +8,7 @@
from .. import utils
from ..cache import Cache

__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212"]
__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "palewire"]
__tags__ = ["html"]
__source__ = {
"name": "Wisconsin Department of Workforce Development",
@@ -38,15 +38,24 @@ def scrape(
today = datetime.today()
current_year = today.year

# Get the current year of data
url = "https://dwd.wisconsin.gov/dislocatedworker/warn/"
r = utils.get_url(
url,
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
)
html = r.text
cache.write(f"wi/{current_year}.html", html)
html_list = [html,]

# Set the date range we're going to scrape
year_range = list(range(2016, current_year + 1))
year_range.reverse()

# Loop through the years and download the pages
html_list = []
for year in year_range:
# Since the 2022 page doesn't exist yet, we're going to hack in a skip
if year == 2022:
# Since the current year page doesn't exist, we're going to hack in a skip
if year == current_year:
continue

# Request fresh pages, use cache for old ones
@@ -62,8 +71,19 @@
# Add to the list
html_list.append(html)

output_rows = []
for ihtml, html in enumerate(html_list):
header = [
"Company",
"City",
"Affected Workers",
"Notice Received",
"Original Notice Type / Update Type",
"Layoff Begin Date",
"NAICS Description",
"County"
"Workforce Development Area"
]
output_rows = [header,]
for html in html_list:
# Parse the HTML
soup = BeautifulSoup(html, "html5lib")

@@ -75,18 +95,11 @@
notice_tables = [t for t in table_list if len(t.find("tr").find_all("th")) > 2]

# Loop through the tables
for itable, table in enumerate(notice_tables):
for table in notice_tables:
# Get all the rows
row_list = table.find_all("tr")

# If this is the first table on the first page, get headers too
tags = ["td"]
if ihtml == 0 and itable == 0:
tags.append("th")

for row in row_list:
for row in table.find_all("tr"):
# Pull out the cells and clean them
cell_list = [_clean_text(c.text) for c in row.find_all(tags)]
cell_list = [_clean_text(c.text) for c in row.find_all(["td"])]

# Skip empty rows
try:
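Editor's note on the wi.py change: the rewrite fetches the current year's notices directly from the department's main WARN page, loops backward through the prior-year archive pages, and then parses every page against a single hand-coded header row, reading only td cells. That removes the old dependence on scraping th headers out of the first table on the first page, which is what broke. A minimal sketch of the new parsing flow, with simplified names that are illustrative rather than the scraper's exact code:

from bs4 import BeautifulSoup

HEADER = [
    "Company",
    "City",
    "Affected Workers",
    "Notice Received",
    "Original Notice Type / Update Type",
    "Layoff Begin Date",
    "NAICS Description",
    "County",
    "Workforce Development Area",
]

def parse_notice_tables(html_list):
    """Build output rows with one hard-coded header, reading only <td> cells."""
    rows = [HEADER]
    for html in html_list:
        soup = BeautifulSoup(html, "html5lib")
        for table in soup.find_all("table"):
            first_row = table.find("tr")
            # Keep only tables whose first row has more than two <th> cells,
            # mirroring the notice-table filter in the diff above
            if first_row is None or len(first_row.find_all("th")) <= 2:
                continue
            for tr in table.find_all("tr"):
                cells = [c.text.strip() for c in tr.find_all("td")]
                if any(cells):  # skip header rows and empty rows
                    rows.append(cells)
    return rows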
9 changes: 9 additions & 0 deletions warn/utils.py
@@ -5,6 +5,7 @@
from pathlib import Path

import requests
from retry import retry
from openpyxl import load_workbook

logger = logging.getLogger(__name__)
@@ -98,6 +99,7 @@ def get_all_scrapers():
)


@retry(tries=3, delay=15, backoff=2)
def get_url(
url, user_agent="Big Local News (biglocalnews.org)", session=None, **kwargs
):
@@ -110,16 +112,23 @@
"""
logger.debug(f"Requesting {url}")

# Set the headers
if "headers" not in kwargs:
kwargs["headers"] = {}
kwargs["headers"]["User-Agent"] = user_agent

# Go get it
if session is not None:
logger.debug(f"Requesting with session {session}")
response = session.get(url, **kwargs)
else:
response = requests.get(url, **kwargs)
logger.debug(f"Response code: {response.status_code}")

# Verify that the response is 200
assert response.ok

# Return the response
return response


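Editor's note on the utils.py change: the new @retry(tries=3, delay=15, backoff=2) decorator from the retry package re-runs get_url whenever it raises, making up to three attempts and waiting 15 seconds before the second try and 30 before the third. Because the function now asserts response.ok, a non-2xx status raises AssertionError and triggers a retry instead of returning a bad response. A rough sketch of the same pattern on a standalone function, with illustrative values:

import requests
from retry import retry

@retry(tries=3, delay=15, backoff=2)
def fetch(url: str) -> requests.Response:
    # Any exception raised here -- including the AssertionError from a
    # non-2xx status -- causes another attempt, up to three in total.
    response = requests.get(url, timeout=30)
    assert response.ok
    return response

If all three attempts fail, the last exception propagates to the caller.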
