From e9d38d2ba5e70baa2011e4b2ee67594ec6467676 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 29 Jan 2022 11:31:56 -0500
Subject: [PATCH 1/3] feat(nm): add scraper

Closes #73
---
 warn/scrapers/nm.py | 127 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 warn/scrapers/nm.py

diff --git a/warn/scrapers/nm.py b/warn/scrapers/nm.py
new file mode 100644
index 00000000..0c189b4f
--- /dev/null
+++ b/warn/scrapers/nm.py
@@ -0,0 +1,127 @@
+import logging
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import pdfplumber
+from bs4 import BeautifulSoup
+
+from .. import utils
+from ..cache import Cache
+
+__authors__ = ["chriszs"]
+__tags__ = ["pdf"]
+
+logger = logging.getLogger(__name__)
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from New Mexico.
+    Keyword arguments:
+    data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+    Returns: the Path where the file is written
+    """
+    # Fire up the cache
+    cache = Cache(cache_dir)
+
+    # Download the root page
+    base_url = "https://www.dws.state.nm.us/"
+    url = f"{base_url}Rapid-Response"
+    page = utils.get_url(url)
+    html = page.text
+
+    # Save it to the cache
+    state_code = "nm"
+    cache_key = f"{state_code}/Rapid-Response.html"
+    cache.write(cache_key, html)
+
+    # Parse out the PDF links
+    document = BeautifulSoup(html, "html.parser")
+    links = document.find_all("a")
+    pdf_urls = [
+        f"{base_url}{link['href']}"
+        for link in links
+        if "WARN" in link.get("href", "") and link.get("href", "").endswith(".pdf")
+    ]
+
+    output_rows = []
+
+    for pdf_index, pdf_url in enumerate(pdf_urls):
+        file_name = os.path.basename(pdf_url)
+        cache_key = f"{state_code}/{file_name}"
+        year = _extract_year(file_name)
+        current_year = datetime.now().year
+        if cache.exists(cache_key) and year is not None and year < current_year - 1:
+            pdf_path = Path(cache_dir, cache_key)
+        else:
+            pdf_path = cache.download(cache_key, pdf_url)
+
+        with pdfplumber.open(pdf_path) as pdf:
+            for page_index, page in enumerate(pdf.pages):
+                rows = page.extract_table() or []  # extract_table returns None if no table is found
+
+                # Loop through the rows
+                for row_index, row in enumerate(rows):
+                    # Skip the header row on each page of all but the first PDF
+                    if pdf_index > 0 and row_index == 0:
+                        logger.debug(
+                            f"Skipping header row on PDF {pdf_index+1} page {page_index+1}"
+                        )
+                        continue
+
+                    # Extract data
+                    output_row = [_clean_text(cell) for cell in row]
+
+                    # Write row
+                    if any([cell != "" for cell in output_row]):
+                        output_rows.append(output_row)
+
+    # Write out to CSV
+    data_path = data_dir / f"{state_code}.csv"
+    utils.write_rows_to_csv(data_path, output_rows)
+
+    # Return the path
+    return data_path
+
+
+def _clean_text(text: str) -> str:
+    """
+    Clean up text from a PDF cell.
+    Keyword arguments:
+    text -- the text to clean
+    Returns: the cleaned text
+    """
+    # Replace None with an empty string
+    if text is None:
+        return ""
+
+    # Standardize whitespace
+    return re.sub(r"\s+", " ", text)
+
+
+def _extract_year(text: str) -> Optional[int]:
+    """
+    Extract the year from a string.
+
+    Keyword arguments:
+    text -- the string to extract the year from
+
+    Returns: the year
+    """
+    match = re.search(r"\d{4}", text)
+
+    if match is not None:
+        return int(match.group(0))
+
+    return None
+
+
+if __name__ == "__main__":
+    scrape()

From 109a369331dbb5ec3d7b538883b831a72dad5f49 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 29 Jan 2022 11:32:18 -0500
Subject: [PATCH 2/3] docs(nm): update sources

---
 docs/sources.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/sources.md b/docs/sources.md
index e480c78f..26f9f380 100644
--- a/docs/sources.md
+++ b/docs/sources.md
@@ -1,6 +1,6 @@
 # Sources
 
-There are currently scrapers for 31 of America's 56 states and territories.
+There are currently scrapers for 32 of America's 56 states and territories.
 
 | State | Docs | Authors | Tags |
 | :---- | :--: | :------ | :--- |
@@ -23,6 +23,7 @@
 |[Montana](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/mt.py)||[ydoc5212](https://github.com/ydoc5212), [zstumgoren](https://github.com/zstumgoren)|excel, html|
 |[Nebraska](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/ne.py)||[Dilcia19](https://github.com/Dilcia19), [zstumgoren](https://github.com/zstumgoren)|html|
 |[New Jersey](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/nj.py)||[Dilcia19](https://github.com/Dilcia19), [zstumgoren](https://github.com/zstumgoren)|html|
+|[New Mexico](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/nm.py)||[chriszs](https://github.com/chriszs)|pdf|
 |[New York](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/ny.py)|[📃](scrapers/ny.md)|[Dilcia19](https://github.com/Dilcia19), [ydoc5212](https://github.com/ydoc5212), [zstumgoren](https://github.com/zstumgoren)|excel, historical|
 |[Ohio](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/oh.py)||[Dilcia19](https://github.com/Dilcia19), [zstumgoren](https://github.com/zstumgoren)|html|
 |[Oklahoma](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/ok.py)|[📃](scrapers/ok.md)|[Dilcia19](https://github.com/Dilcia19), [zstumgoren](https://github.com/zstumgoren)|jobcenter|
@@ -39,7 +40,7 @@
 
 ## To do
 
-These 25 areas need a scraper:
+These 24 areas need a scraper:
 
 - Arkansas
 - Colorado
@@ -53,7 +54,6 @@
 - Mississippi
 - Nevada
 - New Hampshire
-- New Mexico
 - North Carolina
 - North Dakota
 - Pennsylvania

From a5c8792ebd1e60356547511a7bbe10be4bf9c9ca Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 29 Jan 2022 11:38:03 -0500
Subject: [PATCH 3/3] fix(nm): add blank lines in doc strings

---
 warn/scrapers/nm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/warn/scrapers/nm.py b/warn/scrapers/nm.py
index 0c189b4f..c2340ace 100644
--- a/warn/scrapers/nm.py
+++ b/warn/scrapers/nm.py
@@ -23,9 +23,11 @@ def scrape(
 ) -> Path:
     """
     Scrape data from New Mexico.
+
     Keyword arguments:
     data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
     Returns: the Path where the file is written
     """
     # Fire up the cache
@@ -94,8 +96,10 @@ def scrape(
 def _clean_text(text: str) -> str:
     """
     Clean up text from a PDF cell.
+
     Keyword arguments:
     text -- the text to clean
+
     Returns: the cleaned text
     """
     # Replace None with an empty string
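A quick usage sketch for reviewers, not part of the patches: it exercises the new scraper end to end via the `scrape()` signature added in PATCH 1/3. The explicit directories below are illustrative; by default the function falls back to `utils.WARN_DATA_DIR` and `utils.WARN_CACHE_DIR`.

```python
from pathlib import Path

from warn.scrapers import nm

# Fetch the Rapid Response page, download the linked WARN PDFs,
# extract each page's table and write the combined rows to nm.csv
csv_path = nm.scrape(
    data_dir=Path("./data"),    # where nm.csv is written
    cache_dir=Path("./cache"),  # where the page HTML and PDFs are cached
)
print(csv_path)  # e.g. data/nm.csv
```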