Linting pre biglocalnews#625
stucka committed Mar 16, 2024
1 parent c57039d commit 2174e90
Showing 7 changed files with 6 additions and 17 deletions.
warn/scrapers/ca.py: 2 changes (1 addition & 1 deletion)
@@ -11,7 +11,7 @@
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212", "stucka"]
+__authors__ = ["zstumgoren", "Dilcia19", "ydoc5212"]
 __tags__ = ["html", "pdf", "excel"]
 __source__ = {
     "name": "California Employment Development Department",
warn/scrapers/mo.py: 1 change (0 additions & 1 deletion)
@@ -48,7 +48,6 @@ def scrape(
     # Download them all
     html_list = []
     for year in year_range:
-
         # Set the URL, with a hack for 2020 and 2022
         url = f"https://jobs.mo.gov/warn/{year}"

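A minimal standalone sketch of the per-year download loop shown in this hunk, using plain requests; the year bounds are hypothetical and the "hack for 2020 and 2022" mentioned in the comment is not reproduced here.

import requests

year_range = range(2015, 2025)  # hypothetical bounds
html_list = []
for year in year_range:
    # Fetch the WARN listing page for one year and keep the raw HTML
    url = f"https://jobs.mo.gov/warn/{year}"
    response = requests.get(url)
    response.raise_for_status()
    html_list.append(response.text)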
warn/scrapers/nj.py: 1 change (0 additions & 1 deletion)
@@ -42,7 +42,6 @@ def scrape(
     for ws in wb.worksheets:
         logger.debug(f"Parsing {ws}")
         for i, row in enumerate(ws.rows):
-
             # Skip header
             if i == 0:
                 continue
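A self-contained sketch of the openpyxl pattern in this hunk: walk every worksheet, skip each sheet's header row, and collect cell values (the workbook path is hypothetical).

from openpyxl import load_workbook

wb = load_workbook(filename="nj_warn.xlsx")  # hypothetical path
rows = []
for ws in wb.worksheets:
    for i, row in enumerate(ws.rows):
        # Skip the header row on every sheet
        if i == 0:
            continue
        rows.append([cell.value for cell in row])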
warn/scrapers/or.py: 6 changes (2 additions & 4 deletions)
@@ -73,17 +73,15 @@ def scrape(
     r = requests.post(starturl, cookies=cookies, data=payload, headers=requestheaders)
 
     dlsoup = BeautifulSoup(r.content, features="html5lib")
-    excelurl = (
-        baseurl + dlsoup.find("a", {"target": "_blank", "class": "btn-primary"})["href"]
-    )
+    excelurl = baseurl + dlsoup.find("a", {"class": "btn-primary"})["href"]
     logger.debug(f"Found latest data's URL at {excelurl}")
     if not excelurl:
         logger.error("No URL could be found for the newest spreadsheet.")
     latest_excel_path = "or/latest.xlsx"
     logger.debug(f"Trying to save to, we hope, {cache_dir/latest_excel_path}")
     cache.download(latest_excel_path, excelurl)
 
-    workbook = load_workbook(filename=cache_dir/latest_excel_path)
+    workbook = load_workbook(filename=cache_dir / latest_excel_path)
     worksheet = workbook.worksheets[0]
 
     masterlist: list = []
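Note that BeautifulSoup's find() returns None when nothing matches, so indexing ["href"] would raise before the `if not excelurl` check ever runs. A defensive sketch of the same lookup, assuming the same btn-primary anchor; the base URL and the plain GET are placeholders for the scraper's real request.

import requests
from bs4 import BeautifulSoup

baseurl = "https://example.oregon.gov"  # hypothetical base URL
r = requests.get(baseurl)  # the real scraper issues a POST with cookies and a payload

dlsoup = BeautifulSoup(r.content, features="html5lib")
link = dlsoup.find("a", {"class": "btn-primary"})
if link is None:
    raise RuntimeError("No URL could be found for the newest spreadsheet.")
excelurl = baseurl + link["href"]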
warn/scrapers/sc.py: 3 changes (0 additions & 3 deletions)
@@ -89,10 +89,8 @@ def scrape(

     # Open the PDF
     with pdfplumber.open(pdf_path) as pdf:
-
         # Loop through the pages
         for page in pdf.pages:
-
             # Pull out the table
             row_list = page.extract_table()

@@ -110,7 +108,6 @@

             # Loop through each row in the table
             for row in real_rows:
-
                 # Clean values
                 cell_list = [_clean_cell(c) for c in row if _clean_cell(c)]

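A minimal sketch of the pdfplumber flow in these hunks: open the PDF, pull a table from each page, and clean the cells. The _clean_cell stand-in and the file path are hypothetical.

import pdfplumber


def _clean_cell(cell):
    """Stand-in cleaner: collapse whitespace and drop empty values."""
    if cell is None:
        return None
    return " ".join(str(cell).split()) or None


row_list = []
with pdfplumber.open("sc_warn.pdf") as pdf:  # hypothetical path
    for page in pdf.pages:
        table = page.extract_table()
        if not table:
            continue
        for row in table:
            cell_list = [_clean_cell(c) for c in row if _clean_cell(c)]
            if cell_list:
                row_list.append(cell_list)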
warn/scrapers/tn.py: 4 changes (0 additions & 4 deletions)
@@ -64,7 +64,6 @@ def scrape(

     # Loop through them all, skipping the first item, which is a header
     for data in data_list[1:]:
-
         # splitting the data on its delimiter
         items = str(data).split("|")
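A tiny self-contained example of the pipe-delimited parsing in this hunk, with made-up records standing in for the scraped data_list.

data_list = [
    "Company|County|Notice Date|Employees",  # header entry, skipped below
    "Acme Corp|Davidson|2024-01-05|120",
    "Widget LLC|Shelby|2024-02-10|45",
]

rows = []
for data in data_list[1:]:
    items = str(data).split("|")
    rows.append(items)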

@@ -109,10 +108,8 @@

     # Open the PDF
     with pdfplumber.open(pdf_file) as pdf:
-
         # Loop through all the pages
         for i, my_page in enumerate(pdf.pages):
-
             # Sll even pages have data, odd pages don't have the data
             if i % 2 != 0:
                 continue
@@ -135,7 +132,6 @@

             # Loop through all the rows ...
             for row in row_list:
-
                 # Skip remove redundant headers
                 if row[0] in pdf_header_blacklist:
                     continue
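A sketch of the page and row filtering in the two hunks above: keep only even-indexed pages and drop rows whose first cell repeats a known header. The blacklist and file path are hypothetical.

import pdfplumber

pdf_header_blacklist = ["Company", "Notice Date"]  # hypothetical header cells

all_rows = []
with pdfplumber.open("tn_warn.pdf") as pdf:  # hypothetical path
    for i, my_page in enumerate(pdf.pages):
        # Only even-indexed pages carry data in this report
        if i % 2 != 0:
            continue
        row_list = my_page.extract_table() or []
        for row in row_list:
            # Skip redundant headers repeated on each page
            if row and row[0] in pdf_header_blacklist:
                continue
            all_rows.append(row)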
warn/scrapers/tx.py: 6 changes (3 additions & 3 deletions)
@@ -44,7 +44,9 @@ def scrape(

     # Get all the Excel links
     soup = BeautifulSoup(page.text, "html5lib")
-    link_list = soup.find_all("a", href=re.compile("^/sites/default/files/oei/docs/warn-act-listings-"))
+    link_list = soup.find_all(
+        "a", href=re.compile("^/sites/default/files/oei/docs/warn-act-listings-")
+    )
     logger.debug(f"{len(link_list):,} spreadsheet links found")
 
     # Clean up the links and filter 'em down
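A standalone sketch of the reformatted find_all call: filter anchors by a regex on href, here against a small made-up HTML snippet.

import re

from bs4 import BeautifulSoup

html = """
<a href="/sites/default/files/oei/docs/warn-act-listings-2024.xlsx">2024</a>
<a href="/about-us">About</a>
"""  # hypothetical markup
soup = BeautifulSoup(html, "html5lib")
link_list = soup.find_all(
    "a", href=re.compile("^/sites/default/files/oei/docs/warn-act-listings-")
)
href_list = [link["href"] for link in link_list]
# href_list -> ['/sites/default/files/oei/docs/warn-act-listings-2024.xlsx']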
@@ -60,7 +62,6 @@
     # Loop through the links we want to download
     row_list = []
     for ihref, href in enumerate(href_list):
-
         # get each url from the HTML links we found
         data_url = f"https://www.twc.texas.gov{href}"

@@ -77,7 +78,6 @@

         # Convert the sheet to a list of lists
         for irow, row in enumerate(worksheet.rows):
-
             # Skip headers after the first workbook
             if ihref > 0 and irow == 0:
                 continue
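A sketch of the merge pattern in the last two hunks: read each downloaded workbook and keep the header row only from the first one. The local file paths stand in for the cached downloads.

from openpyxl import load_workbook

excel_paths = ["tx_2023.xlsx", "tx_2024.xlsx"]  # hypothetical cached downloads

row_list = []
for ihref, path in enumerate(excel_paths):
    worksheet = load_workbook(filename=path).worksheets[0]
    for irow, row in enumerate(worksheet.rows):
        # Skip headers after the first workbook
        if ihref > 0 and irow == 0:
            continue
        row_list.append([cell.value for cell in row])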
