From b5acb0df8c2cad740f05a06e1f85d8702fd9412d Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 20 Jan 2022 07:02:09 -0500
Subject: [PATCH 1/9] feat(id): add scraper

Closes #82
---
 warn/scrapers/id.py | 81 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 warn/scrapers/id.py

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
new file mode 100644
index 00000000..a18c8ae0
--- /dev/null
+++ b/warn/scrapers/id.py
@@ -0,0 +1,81 @@
+import logging
+import random
+import re
+import typing
+from pathlib import Path
+
+import pdfplumber
+import requests
+
+from .. import utils
+
+logger = logging.getLogger(__name__)
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: typing.Optional[Path] = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from Idaho.
+
+    Keyword arguments:
+    data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
+    Returns: the Path where the file is written
+    """
+
+    # There's a numeric parameter called v on this PDF URL that updates
+    # from time to time. Suspect this is a cache-buster. We're using a
+    # random number instead.
+    cache_buster = random.randrange(1, 10000000000)
+
+    url = f"https://www.labor.idaho.gov/dnn/Portals/0/Publications/WARNNotice.pdf?v={cache_buster}"
+
+    cache_state = Path(cache_dir, "id")
+    cache_state.mkdir(parents=True, exist_ok=True)
+
+    pdf_file = f"{cache_state}/WARNNotice.pdf"
+
+    # verify=False because there's a persistent cert error
+    # we're working around.
+    response = requests.get(url, verify=False)
+    with open(pdf_file, "wb") as file:
+        file.write(response.content)
+
+    include_header = True
+
+    output_rows = []
+    with pdfplumber.open(pdf_file) as pdf:
+        for idx, page in enumerate(pdf.pages):
+            rows = page.extract_tables()[0]
+
+            for row in rows:
+                output_row = []
+                for column in row:
+                    if column is None:
+                        output_row.append("")
+                    else:
+                        # Collapse newlines
+                        partial = re.sub(r"\n", " ", column)
+                        # Standardize whitespace
+                        clean_text = re.sub(r"\s+", " ", partial)
+                        output_row.append(clean_text)
+
+                if len(output_row) > 0 and (
+                    output_row[0] != "Date of Letter" or include_header
+                ):
+                    output_rows.append(output_row)
+
+            include_header = False
+
+    # Write out the data to a CSV
+    data_path = data_dir / "id.csv"
+    utils.write_rows_to_csv(output_rows, data_path)
+
+    return data_path
+
+
+if __name__ == "__main__":
+    scrape()

From d2df46bc82abc8cfb49efe7a2db8cc006d6dc117 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 19:55:42 -0500
Subject: [PATCH 2/9] feat(id): improve scraper

Fill down merged cells and other misc. code changes.
---
 warn/scrapers/id.py | 106 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 32 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index a18c8ae0..756f3f38 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -1,7 +1,6 @@
 import logging
 import random
 import re
-import typing
 from pathlib import Path
 
 import pdfplumber
@@ -9,12 +8,15 @@
 
 from .. import utils
 
+__authors__ = ["chriszs"]
+__tags__ = ["pdf"]
+
 logger = logging.getLogger(__name__)
 
 
 def scrape(
     data_dir: Path = utils.WARN_DATA_DIR,
-    cache_dir: typing.Optional[Path] = utils.WARN_CACHE_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
 ) -> Path:
     """
     Scrape data from Idaho.
@@ -26,56 +28,96 @@ def scrape(
     Returns: the Path where the file is written
     """
 
+    state_code = "id"
+    base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
+    file_name = "WARNNotice.pdf"
+
     # There's a numeric parameter called v on this PDF URL that updates
     # from time to time. Suspect this is a cache-buster. We're using a
     # random number instead.
-    cache_buster = random.randrange(1, 10000000000)
+    min_cache_buster = 0
+    max_cache_buster = 10000000000
+    cache_buster = random.randrange(min_cache_buster, max_cache_buster)
 
-    url = f"https://www.labor.idaho.gov/dnn/Portals/0/Publications/WARNNotice.pdf?v={cache_buster}"
+    url = f"{base_url}{file_name}?v={cache_buster}"
 
-    cache_state = Path(cache_dir, "id")
+    cache_state = Path(cache_dir, state_code)
     cache_state.mkdir(parents=True, exist_ok=True)
 
-    pdf_file = f"{cache_state}/WARNNotice.pdf"
+    cache_key = f"{cache_state}/WARNNotice.pdf"
 
     # verify=False because there's a persistent cert error
     # we're working around.
     response = requests.get(url, verify=False)
-    with open(pdf_file, "wb") as file:
+    with open(cache_key, "wb") as file:
         file.write(response.content)
 
-    include_header = True
-
     output_rows = []
-    with pdfplumber.open(pdf_file) as pdf:
-        for idx, page in enumerate(pdf.pages):
-            rows = page.extract_tables()[0]
-
-            for row in rows:
-                output_row = []
-                for column in row:
-                    if column is None:
-                        output_row.append("")
-                    else:
-                        # Collapse newlines
-                        partial = re.sub(r"\n", " ", column)
-                        # Standardize whitespace
-                        clean_text = re.sub(r"\s+", " ", partial)
-                        output_row.append(clean_text)
-
-                if len(output_row) > 0 and (
-                    output_row[0] != "Date of Letter" or include_header
-                ):
-                    output_rows.append(output_row)
-
-            include_header = False
+
+    with pdfplumber.open(cache_key) as pdf:
+        for index, page in enumerate(pdf.pages):
+            rows = page.extract_table()
+
+            output_rows = output_rows + _clean_table(rows, index)
 
     # Write out the data to a CSV
-    data_path = data_dir / "id.csv"
+    data_path = Path(data_dir, f"{state_code}.csv")
     utils.write_rows_to_csv(output_rows, data_path)
 
     return data_path
 
 
+def _clean_table(rows: list, page_index: int) -> list:
+    """
+    Clean up a table from a PDF.
+
+    Keyword arguments:
+    rows -- the rows of the table
+    page_index -- the index of the page
+
+    Returns: a list of lists, where each inner list is a row in the table
+    """
+    output_rows = []
+
+    for row_index, row in enumerate(rows):
+        output_row = []
+        for col_index, column in enumerate(row):
+            clean_text = _clean_text(column)
+
+            # If cell is empty, copy from the cell above it
+            # to deal with merged cells. Except for number of employees,
+            # which is effectively a total for all locations in the merged cell
+            # and which we don't want a data user to double count.
+            if (
+                clean_text == ""
+                and row_index > 0
+                and col_index < len(output_rows[row_index - 1])
+                and output_rows[0][col_index] != "No. of Employees Affected"
+            ):
+                clean_text = output_rows[row_index - 1][col_index]
+
+            output_row.append(clean_text)
+
+        output_rows.append(output_row)
+
+    # Only include the header on the first page
+    if page_index != 0:
+        return output_rows[1:]
+
+    return output_rows
+
+
+def _clean_text(text: str) -> str:
+    """
+    Clean up text from a PDF cell.
+    """
+    if text is None:
+        return ""
+    # Collapse newlines
+    partial = re.sub(r"\n", " ", text)
+    # Standardize whitespace
+    return re.sub(r"\s+", " ", partial)
+
+
 if __name__ == "__main__":
     scrape()

From 143a309535384a45f114dc8ccf661fddeed99e04 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 22 Jan 2022 12:40:03 -0500
Subject: [PATCH 3/9] fix(id): fix linter errors

---
 warn/scrapers/id.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index 756f3f38..db122510 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -27,7 +27,6 @@ def scrape(
 
     Returns: the Path where the file is written
     """
-
     state_code = "id"
     base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
     file_name = "WARNNotice.pdf"
@@ -110,6 +109,11 @@ def _clean_table(rows: list, page_index: int) -> list:
 def _clean_text(text: str) -> str:
     """
     Clean up text from a PDF cell.
+
+    Keyword arguments:
+    text -- the text to clean
+
+    Returns: the cleaned text
     """
     if text is None:
         return ""

From b9a50c71a3a1dc1cce0ea74f337389fe93f110be Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 22 Jan 2022 12:54:47 -0500
Subject: [PATCH 4/9] refactor(id): use file_name var in cache key

---
 warn/scrapers/id.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index db122510..d6dc38ae 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -43,7 +43,7 @@ def scrape(
     cache_state = Path(cache_dir, state_code)
     cache_state.mkdir(parents=True, exist_ok=True)
 
-    cache_key = f"{cache_state}/WARNNotice.pdf"
+    cache_key = f"{cache_state}/{file_name}"
 
     # verify=False because there's a persistent cert error
     # we're working around.

From 6f5d8a5b4782b479df0372fff513bd4af36cc05a Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sun, 23 Jan 2022 23:18:58 -0500
Subject: [PATCH 5/9] fix(id): add type annotation

---
 warn/scrapers/id.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index d6dc38ae..a78089fa 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -51,7 +51,7 @@ def scrape(
     with open(cache_key, "wb") as file:
         file.write(response.content)
 
-    output_rows = []
+    output_rows: list = []
 
     with pdfplumber.open(cache_key) as pdf:
         for index, page in enumerate(pdf.pages):
@@ -76,7 +76,7 @@ def _clean_table(rows: list, page_index: int) -> list:
 
     Returns: a list of lists, where each inner list is a row in the table
     """
-    output_rows = []
+    output_rows: list = []
 
     for row_index, row in enumerate(rows):
         output_row = []

From 8aceba82e048c2e6dc6e0847783ac108eb9083e2 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 27 Jan 2022 11:31:31 -0500
Subject: [PATCH 6/9] refactor(id): incorporate revisions from code review

Co-authored-by: Ben Welsh
---
 warn/scrapers/id.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index a78089fa..a01b23c2 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -57,11 +57,11 @@ def scrape(
         for index, page in enumerate(pdf.pages):
             rows = page.extract_table()
 
-            output_rows = output_rows + _clean_table(rows, index)
+            output_rows += _clean_table(rows, index)
 
     # Write out the data to a CSV
-    data_path = Path(data_dir, f"{state_code}.csv")
-    utils.write_rows_to_csv(output_rows, data_path)
+    data_path = data_dir / f"{state_code}.csv"
+    utils.write_rows_to_csv(data_path, output_rows)
 
     return data_path
 
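
The merged-cell fill-down introduced in PATCH 2 and refined through PATCH 6 is easier to follow outside the diff context. The standalone sketch below reimplements the same idea; the fill_down helper and the sample rows are illustrative assumptions, not code from the patches, though the cleaning and fill-down rules mirror _clean_text and _clean_table above.

    import re


    def _clean_text(text):
        """Treat None as empty, collapse newlines and squeeze whitespace."""
        if text is None:
            return ""
        return re.sub(r"\s+", " ", re.sub(r"\n", " ", text))


    def fill_down(rows, skip_header="No. of Employees Affected"):
        """Copy a blank cell's value from the row above, except in the employee-count column."""
        cleaned = []
        for row_index, row in enumerate(rows):
            out = []
            for col_index, cell in enumerate(row):
                text = _clean_text(cell)
                if (
                    text == ""
                    and row_index > 0
                    and col_index < len(cleaned[row_index - 1])
                    and cleaned[0][col_index] != skip_header
                ):
                    # Inherit the value from the merged cell above
                    text = cleaned[row_index - 1][col_index]
                out.append(text)
            cleaned.append(out)
        return cleaned


    if __name__ == "__main__":
        sample = [
            ["Date of Letter", "Company", "No. of Employees Affected"],
            ["1/3/2022", "Acme Corp.\nBoise", "50"],
            [None, None, None],  # cells merged with the row above
        ]
        for row in fill_down(sample):
            print(row)

Running the sketch copies the date and company down into the third row but leaves its employee count blank, which is the double-counting guard the comments in _clean_table describe.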
From 4dbae6572e023dbf41c3e729645dadae77bc4182 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 27 Jan 2022 12:08:45 -0500
Subject: [PATCH 7/9] feat(cache): add kwargs to download

Allow arbitrary options to be passed to requests.get
---
 warn/cache.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/warn/cache.py b/warn/cache.py
index 5ac9b559..ef511cd1 100644
--- a/warn/cache.py
+++ b/warn/cache.py
@@ -71,20 +71,20 @@ def read_csv(self, name):
         with open(path) as fh:
             return list(csv.reader(fh))
 
-    def download(self, name: str, url: str) -> Path:
+    def download(self, name: str, url: str, **kwargs) -> Path:
         """
         Download the provided URL and save it in the cache.
 
         Args:
             name (str): The path where the file will be saved. Can be a simple string like "ia/data.xlsx"
            url (str): The URL to download
+            **kwargs: Additional arguments to pass to requests.get()
 
         Returns: The Path where the file was saved
         """
         # Request the URL
         logger.debug(f"Requesting {url}")
-        with requests.get(url, stream=True) as r:
-
+        with requests.get(url, stream=True, **kwargs) as r:
             # If there's no encoding, set it
             if r.encoding is None:
                 r.encoding = "utf-8"

From 2c1b0f38c1a309a12779e05c5ac2d42fe4fc09e9 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 27 Jan 2022 12:09:17 -0500
Subject: [PATCH 8/9] refactor(id): clean up scraper

---
 warn/scrapers/id.py | 66 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index a01b23c2..6847305d 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -4,9 +4,9 @@
 from pathlib import Path
 
 import pdfplumber
-import requests
 
 from .. import utils
+from ..cache import Cache
 
 __authors__ = ["chriszs"]
 __tags__ = ["pdf"]
@@ -27,6 +27,8 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    cache = Cache(cache_dir)
+
     state_code = "id"
     base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
     file_name = "WARNNotice.pdf"
@@ -40,24 +42,19 @@ def scrape(
 
     url = f"{base_url}{file_name}?v={cache_buster}"
 
-    cache_state = Path(cache_dir, state_code)
-    cache_state.mkdir(parents=True, exist_ok=True)
-
-    cache_key = f"{cache_state}/{file_name}"
+    cache_key = f"{state_code}/{file_name}"
 
     # verify=False because there's a persistent cert error
     # we're working around.
-    response = requests.get(url, verify=False)
-    with open(cache_key, "wb") as file:
-        file.write(response.content)
+    pdf_file = cache.download(cache_key, url, verify=False)
 
     output_rows: list = []
 
-    with pdfplumber.open(cache_key) as pdf:
+    with pdfplumber.open(pdf_file) as pdf:
         for index, page in enumerate(pdf.pages):
             rows = page.extract_table()
 
-            output_rows += _clean_table(rows, index)
+            output_rows += _clean_table(rows, index)
 
     # Write out the data to a CSV
     data_path = data_dir / f"{state_code}.csv"
@@ -88,10 +85,10 @@ def _clean_table(rows: list, page_index: int) -> list:
             # which is effectively a total for all locations in the merged cell
             # and which we don't want a data user to double count.
             if (
-                clean_text == ""
-                and row_index > 0
-                and col_index < len(output_rows[row_index - 1])
-                and output_rows[0][col_index] != "No. of Employees Affected"
+                _is_empty(clean_text)
+                and _column_exists_in_prior_row(row_index, col_index, output_rows)
+                and "No. of Employees"
+                not in _column_name_from_index(col_index, output_rows)
             ):
                 clean_text = output_rows[row_index - 1][col_index]
 
@@ -106,6 +103,47 @@ def _clean_table(rows: list, page_index: int) -> list:
     return output_rows
+
+
+def _is_empty(text: str) -> bool:
+    """
+    Determine if a cell is empty.
+
+    Keyword arguments:
+    text -- the text to check
+
+    Returns: True if the cell is empty, False otherwise
+    """
+    return text == ""
+
+
+def _column_exists_in_prior_row(
+    row_index: int, col_index: int, output_rows: list
+) -> bool:
+    """
+    Determine if a column exists in the prior row.
+
+    Keyword arguments:
+    row_index -- the index of the row
+    col_index -- the index of the column
+    output_rows -- the output rows
+
+    Returns: True if the column exists, False otherwise
+    """
+    return row_index > 0 and col_index < len(output_rows[row_index - 1])
+
+
+def _column_name_from_index(col_index: int, output_rows: list) -> str:
+    """
+    Determine the column name from the column index.
+
+    Keyword arguments:
+    col_index -- the index of the column
+    output_rows -- the output rows
+
+    Returns: the column name
+    """
+    return output_rows[0][col_index]
 
 
 def _clean_text(text: str) -> str:
     """
     Clean up text from a PDF cell.

From ff231bdc0228eb9e8ccf50a46d3d13a4bda4ee85 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 27 Jan 2022 12:14:39 -0500
Subject: [PATCH 9/9] refactor(id): adjust arg order in utils

---
 warn/scrapers/id.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index 6847305d..43454a67 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -86,9 +86,9 @@ def _clean_table(rows: list, page_index: int) -> list:
             # and which we don't want a data user to double count.
             if (
                 _is_empty(clean_text)
-                and _column_exists_in_prior_row(row_index, col_index, output_rows)
+                and _column_exists_in_prior_row(output_rows, row_index, col_index)
                 and "No. of Employees"
-                not in _column_name_from_index(col_index, output_rows)
+                not in _column_name_from_index(output_rows, col_index)
             ):
                 clean_text = output_rows[row_index - 1][col_index]
 
@@ -116,7 +116,7 @@ def _is_empty(text: str) -> bool:
 
 
 def _column_exists_in_prior_row(
-    row_index: int, col_index: int, output_rows: list
+    output_rows: list, row_index: int, col_index: int
 ) -> bool:
     """
     Determine if a column exists in the prior row.
@@ -131,7 +131,7 @@ def _column_exists_in_prior_row(
     return row_index > 0 and col_index < len(output_rows[row_index - 1])
 
 
-def _column_name_from_index(col_index: int, output_rows: list) -> str:
+def _column_name_from_index(output_rows: list, col_index: int) -> str:
     """
     Determine the column name from the column index.
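
Taken together, PATCH 7 and PATCH 8 replace the scraper's hand-rolled requests call with the shared Cache helper, with **kwargs forwarded to requests.get(). The following is a minimal usage sketch of that call path, assuming the package is importable as warn; the driver script itself is hypothetical, while the Cache class, WARN_CACHE_DIR, the cache key, and the URL mirror the values used in the patches above.

    from warn.cache import Cache
    from warn.utils import WARN_CACHE_DIR

    # Cache stores downloads under the shared cache directory;
    # "id/WARNNotice.pdf" becomes the file's relative path inside it.
    cache = Cache(WARN_CACHE_DIR)

    # verify=False is forwarded to requests.get() through the **kwargs added in
    # PATCH 7, preserving the certificate workaround noted in the scraper.
    pdf_path = cache.download(
        "id/WARNNotice.pdf",
        "https://www.labor.idaho.gov/dnn/Portals/0/Publications/WARNNotice.pdf",
        verify=False,
    )
    print(pdf_path)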