Skip to content

Commit

Permalink
Patch MD #674
Browse files Browse the repository at this point in the history
  • Loading branch information
stucka committed Nov 7, 2024
1 parent c3db8e7 commit af15c7a
Showing 1 changed file with 40 additions and 9 deletions.
49 changes: 40 additions & 9 deletions warn/scrapers/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,13 @@ def scrape(
# Set the cache
cache = Cache(cache_dir)

# In November 2024 Maryland's site began failing many of our connections. Two workarounds helped: sending a User-Agent header and passing verify=False with each request.
request_headers = {"User-Agent": "BigLocalNews.org"}
request_verify = False

# Get the page
url = "https://www.dllr.state.md.us/employment/warn.shtml"
r = utils.get_url(url)
r = utils.get_url(url, headers=request_headers, verify=request_verify)
r.encoding = "utf-8"
html = r.text

Expand All @@ -56,17 +60,44 @@ def scrape(
html_list = []
html_list.append(html) # Save the source HTML for parsing also

old_pages = [
"warn2023.shtml",
"warn2022.shtml",
"warn2021.shtml",
"warn2020.shtml",
"warn2019.shtml",
"warn2018.shtml",
"warn2017.shtml",
"warn2016.shtml",
"warn2015.shtml",
"warn2014.shtml",
"warn2013.shtml",
"warn2012.shtml",
"warn2011.shtml",
"warn2010.shtml",
]

for href in href_list:
# Request the HTML
url = f"https://www.dllr.state.md.us/employment/{href}"
r = utils.get_url(url)
r.encoding = "utf-8"
html = r.text

# Save it to the cache
cache.write(f"md/{href}.html", html)

sleep(naptime) # Try to stop blocked connections by being less aggressive
filename = f"md/{href}.html"

if href not in old_pages:
sleep(naptime) # Try to stop blocked connections by being less aggressive
r = utils.get_url(url, headers=request_headers, verify=request_verify)
r.encoding = "utf-8"
html = r.text

# Save it to the cache
cache.write(filename, html)
else:
r = utils.fetch_if_not_cached(
cache_dir / filename,
url,
headers=request_headers,
verify=request_verify,
)
html = cache.read(filename)

# Add it to the list
html_list.append(html)
Expand Down

0 comments on commit af15c7a

Please sign in to comment.