From e9d38d2ba5e70baa2011e4b2ee67594ec6467676 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 29 Jan 2022 11:31:56 -0500
Subject: [PATCH 1/3] feat(nm): add scraper

Closes #73
---
 warn/scrapers/nm.py | 127 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 warn/scrapers/nm.py

diff --git a/warn/scrapers/nm.py b/warn/scrapers/nm.py
new file mode 100644
index 00000000..0c189b4f
--- /dev/null
+++ b/warn/scrapers/nm.py
@@ -0,0 +1,127 @@
+import logging
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import pdfplumber
+from bs4 import BeautifulSoup
+
+from .. import utils
+from ..cache import Cache
+
+__authors__ = ["chriszs"]
+__tags__ = ["pdf"]
+
+logger = logging.getLogger(__name__)
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from New Mexico.
+    Keyword arguments:
+    data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+    Returns: the Path where the file is written
+    """
+    # Fire up the cache
+    cache = Cache(cache_dir)
+
+    # Download the root page
+    base_url = "https://www.dws.state.nm.us/"
+    url = f"{base_url}Rapid-Response"
+    page = utils.get_url(url)
+    html = page.text
+
+    # Save it to the cache
+    state_code = "nm"
+    cache_key = f"{state_code}/Rapid-Response.html"
+    cache.write(cache_key, html)
+
+    # Parse out the PDF links
+    document = BeautifulSoup(html, "html.parser")
+    links = document.find_all("a")
+    pdf_urls = [
+        f"{base_url}{link['href']}"
+        for link in links
+        if "WARN" in link.get("href", "") and link.get("href", "").endswith(".pdf")
+    ]
+
+    output_rows = []
+
+    for pdf_index, pdf_url in enumerate(pdf_urls):
+        file_name = os.path.basename(pdf_url)
+        cache_key = f"{state_code}/{file_name}"
+        year = _extract_year(file_name)
+        current_year = datetime.now().year
+        if cache.exists(cache_key) and year is not None and year < current_year - 1:
+            pdf_path = Path(cache_dir, cache_key)
+        else:
+            pdf_path = cache.download(cache_key, pdf_url)
+
+        with pdfplumber.open(pdf_path) as pdf:
+            for page_index, page in enumerate(pdf.pages):
+                rows = page.extract_table() or []  # extract_table returns None if no table is found
+
+                # Loop through the rows
+                for row_index, row in enumerate(rows):
+                    # Skip the header row on each page of all but the first PDF
+                    if pdf_index > 0 and row_index == 0:
+                        logger.debug(
+                            f"Skipping header row on PDF {pdf_index+1} page {page_index+1}"
+                        )
+                        continue
+
+                    # Extract data
+                    output_row = [_clean_text(cell) for cell in row]
+
+                    # Write row
+                    if any([cell != "" for cell in output_row]):
+                        output_rows.append(output_row)
+
+    # Write out to CSV
+    data_path = data_dir / f"{state_code}.csv"
+    utils.write_rows_to_csv(data_path, output_rows)
+
+    # Return the path
+    return data_path
+
+
+def _clean_text(text: str) -> str:
+    """
+    Clean up text from a PDF cell.
+    Keyword arguments:
+    text -- the text to clean
+    Returns: the cleaned text
+    """
+    # Replace None with an empty string
+    if text is None:
+        return ""
+
+    # Standardize whitespace
+    return re.sub(r"\s+", " ", text)
+
+
+def _extract_year(text: str) -> Optional[int]:
+    """
+    Extract the year from a string.
+
+    Keyword arguments:
+    text -- the string to extract the year from
+
+    Returns: the year
+    """
+    match = re.search(r"\d{4}", text)
+
+    if match is not None:
+        return int(match.group(0))
+
+    return None
+
+
+if __name__ == "__main__":
+    scrape()

From 109a369331dbb5ec3d7b538883b831a72dad5f49 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 29 Jan 2022 11:32:18 -0500
Subject: [PATCH 2/3] docs(nm): update sources

---
 docs/sources.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/sources.md b/docs/sources.md
index e480c78f..26f9f380 100644
--- a/docs/sources.md
+++ b/docs/sources.md
@@ -1,6 +1,6 @@
 # Sources
 
-There are currently scrapers for 31 of America's 56 states and territories.
+There are currently scrapers for 32 of America's 56 states and territories.
 
 | State | Docs | Authors | Tags |
 | :---- | :--: | :------ | :--- |
@@ -23,6 +23,7 @@
 |[Montana](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/mt.py)||[ydoc5212](https://github.com/ydoc5212), [zstumgoren](https://github.com/zstumgoren)|excel, html|
 |[Nebraska](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/ne.py)||[Dilcia19](https://github.com/Dilcia19), [zstumgoren](https://github.com/zstumgoren)|html|
 |[New Jersey](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/nj.py)||[Dilcia19](https://github.com/Dilcia19), [zstumgoren](https://github.com/zstumgoren)|html|
+|[New Mexico](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/nm.py)||[chriszs](https://github.com/chriszs)|pdf|
 |[New York](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/ny.py)|[📃](scrapers/ny.md)|[Dilcia19](https://github.com/Dilcia19), [ydoc5212](https://github.com/ydoc5212), [zstumgoren](https://github.com/zstumgoren)|excel, historical|
 |[Ohio](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/oh.py)||[Dilcia19](https://github.com/Dilcia19), [zstumgoren](https://github.com/zstumgoren)|html|
 |[Oklahoma](https://github.com/biglocalnews/warn-scraper/blob/main/warn/scrapers/ok.py)|[📃](scrapers/ok.md)|[Dilcia19](https://github.com/Dilcia19), [zstumgoren](https://github.com/zstumgoren)|jobcenter|
@@ -39,7 +40,7 @@
 
 ## To do
 
-These 25 areas need a scraper:
+These 24 areas need a scraper:
 
 - Arkansas
 - Colorado
@@ -53,7 +54,6 @@
 - Mississippi
 - Nevada
 - New Hampshire
-- New Mexico
 - North Carolina
 - North Dakota
 - Pennsylvania

From a5c8792ebd1e60356547511a7bbe10be4bf9c9ca Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 29 Jan 2022 11:38:03 -0500
Subject: [PATCH 3/3] fix(nm): add blank lines in doc strings

---
 warn/scrapers/nm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/warn/scrapers/nm.py b/warn/scrapers/nm.py
index 0c189b4f..c2340ace 100644
--- a/warn/scrapers/nm.py
+++ b/warn/scrapers/nm.py
@@ -23,9 +23,11 @@ def scrape(
 ) -> Path:
     """
     Scrape data from New Mexico.
+
     Keyword arguments:
     data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
     Returns: the Path where the file is written
     """
     # Fire up the cache
@@ -94,8 +96,10 @@ def scrape(
 def _clean_text(text: str) -> str:
     """
     Clean up text from a PDF cell.
+
     Keyword arguments:
     text -- the text to clean
+
     Returns: the cleaned text
     """
     # Replace None with an empty string
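A quick usage sketch for reviewers, not part of the patches: it exercises the new scraper end to end via the `scrape()` signature added in PATCH 1/3. The explicit directories below are illustrative; by default the function falls back to `utils.WARN_DATA_DIR` and `utils.WARN_CACHE_DIR`.

```python
from pathlib import Path

from warn.scrapers import nm

# Fetch the Rapid Response page, download the linked WARN PDFs,
# extract each page's table and write the combined rows to nm.csv
csv_path = nm.scrape(
    data_dir=Path("./data"),    # where nm.csv is written
    cache_dir=Path("./cache"),  # where the page HTML and PDFs are cached
)
print(csv_path)  # e.g. data/nm.csv
```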