From b5acb0df8c2cad740f05a06e1f85d8702fd9412d Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 20 Jan 2022 07:02:09 -0500
Subject: [PATCH 1/9] feat(id): add scraper

Closes #82
---
 warn/scrapers/id.py | 81 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 warn/scrapers/id.py

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
new file mode 100644
index 00000000..a18c8ae0
--- /dev/null
+++ b/warn/scrapers/id.py
@@ -0,0 +1,81 @@
+import logging
+import random
+import re
+import typing
+from pathlib import Path
+
+import pdfplumber
+import requests
+
+from .. import utils
+
+logger = logging.getLogger(__name__)
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: typing.Optional[Path] = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from Idaho.
+
+    Keyword arguments:
+    data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
+    Returns: the Path where the file is written
+    """
+
+    # There's a numeric parameter called v on this PDF URL that updates
+    # from time to time. Suspect this is a cache-buster. We're using a
+    # random number instead.
+    cache_buster = random.randrange(1, 10000000000)
+
+    url = f"https://www.labor.idaho.gov/dnn/Portals/0/Publications/WARNNotice.pdf?v={cache_buster}"
+
+    cache_state = Path(cache_dir, "id")
+    cache_state.mkdir(parents=True, exist_ok=True)
+
+    pdf_file = f"{cache_state}/WARNNotice.pdf"
+
+    # verify=False because there's a persistent cert error
+    # we're working around.
+    response = requests.get(url, verify=False)
+    with open(pdf_file, "wb") as file:
+        file.write(response.content)
+
+    include_header = True
+
+    output_rows = []
+    with pdfplumber.open(pdf_file) as pdf:
+        for idx, page in enumerate(pdf.pages):
+            rows = page.extract_tables()[0]
+
+            for row in rows:
+                output_row = []
+                for column in row:
+                    if column is None:
+                        output_row.append("")
+                    else:
+                        # Collapse newlines
+                        partial = re.sub(r"\n", " ", column)
+                        # Standardize whitespace
+                        clean_text = re.sub(r"\s+", " ", partial)
+                        output_row.append(clean_text)
+
+                if len(output_row) > 0 and (
+                    output_row[0] != "Date of Letter" or include_header
+                ):
+                    output_rows.append(output_row)
+
+            include_header = False
+
+    # Write out the data to a CSV
+    data_path = data_dir / "id.csv"
+    utils.write_rows_to_csv(output_rows, data_path)
+
+    return data_path
+
+
+if __name__ == "__main__":
+    scrape()

From d2df46bc82abc8cfb49efe7a2db8cc006d6dc117 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 19:55:42 -0500
Subject: [PATCH 2/9] feat(id): improve scraper

Fill down merged cells and other misc. code changes.
---
 warn/scrapers/id.py | 106 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 32 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index a18c8ae0..756f3f38 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -1,7 +1,6 @@
 import logging
 import random
 import re
-import typing
 from pathlib import Path
 
 import pdfplumber
@@ -9,12 +8,15 @@
 
 from .. import utils
 
+__authors__ = ["chriszs"]
+__tags__ = ["pdf"]
+
 logger = logging.getLogger(__name__)
 
 
 def scrape(
     data_dir: Path = utils.WARN_DATA_DIR,
-    cache_dir: typing.Optional[Path] = utils.WARN_CACHE_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
 ) -> Path:
     """
     Scrape data from Idaho.
@@ -26,56 +28,96 @@ def scrape(
     Returns: the Path where the file is written
     """
 
+    state_code = "id"
+    base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
+    file_name = "WARNNotice.pdf"
+
     # There's a numeric parameter called v on this PDF URL that updates
     # from time to time. Suspect this is a cache-buster. We're using a
     # random number instead.
-    cache_buster = random.randrange(1, 10000000000)
+    min_cache_buster = 0
+    max_cache_buster = 10000000000
+    cache_buster = random.randrange(min_cache_buster, max_cache_buster)
 
-    url = f"https://www.labor.idaho.gov/dnn/Portals/0/Publications/WARNNotice.pdf?v={cache_buster}"
+    url = f"{base_url}{file_name}?v={cache_buster}"
 
-    cache_state = Path(cache_dir, "id")
+    cache_state = Path(cache_dir, state_code)
     cache_state.mkdir(parents=True, exist_ok=True)
 
-    pdf_file = f"{cache_state}/WARNNotice.pdf"
+    cache_key = f"{cache_state}/WARNNotice.pdf"
 
     # verify=False because there's a persistent cert error
     # we're working around.
     response = requests.get(url, verify=False)
-    with open(pdf_file, "wb") as file:
+    with open(cache_key, "wb") as file:
         file.write(response.content)
 
-    include_header = True
-
     output_rows = []
-    with pdfplumber.open(pdf_file) as pdf:
-        for idx, page in enumerate(pdf.pages):
-            rows = page.extract_tables()[0]
-
-            for row in rows:
-                output_row = []
-                for column in row:
-                    if column is None:
-                        output_row.append("")
-                    else:
-                        # Collapse newlines
-                        partial = re.sub(r"\n", " ", column)
-                        # Standardize whitespace
-                        clean_text = re.sub(r"\s+", " ", partial)
-                        output_row.append(clean_text)
-
-                if len(output_row) > 0 and (
-                    output_row[0] != "Date of Letter" or include_header
-                ):
-                    output_rows.append(output_row)
-
-            include_header = False
+
+    with pdfplumber.open(cache_key) as pdf:
+        for index, page in enumerate(pdf.pages):
+            rows = page.extract_table()
+
+            output_rows = output_rows + _clean_table(rows, index)
 
     # Write out the data to a CSV
-    data_path = data_dir / "id.csv"
+    data_path = Path(data_dir, f"{state_code}.csv")
     utils.write_rows_to_csv(output_rows, data_path)
 
     return data_path
 
 
+def _clean_table(rows: list, page_index: int) -> list:
+    """
+    Clean up a table from a PDF.
+
+    Keyword arguments:
+    rows -- the rows of the table
+    page_index -- the index of the page
+
+    Returns: a list of lists, where each inner list is a row in the table
+    """
+    output_rows = []
+
+    for row_index, row in enumerate(rows):
+        output_row = []
+        for col_index, column in enumerate(row):
+            clean_text = _clean_text(column)
+
+            # If cell is empty, copy from the cell above it
+            # to deal with merged cells. Except for number of employees,
+            # which is effectively a total for all locations in the merged cell
+            # and which we don't want a data user to double count.
+            if (
+                clean_text == ""
+                and row_index > 0
+                and col_index < len(output_rows[row_index - 1])
+                and output_rows[0][col_index] != "No. of Employees Affected"
+            ):
+                clean_text = output_rows[row_index - 1][col_index]
+
+            output_row.append(clean_text)
+
+        output_rows.append(output_row)
+
+    # Only include the header on the first page
+    if page_index != 0:
+        return output_rows[1:]
+
+    return output_rows
+
+
+def _clean_text(text: str) -> str:
+    """
+    Clean up text from a PDF cell.
+    """
+    if text is None:
+        return ""
+    # Collapse newlines
+    partial = re.sub(r"\n", " ", text)
+    # Standardize whitespace
+    return re.sub(r"\s+", " ", partial)
+
+
 if __name__ == "__main__":
     scrape()

From 143a309535384a45f114dc8ccf661fddeed99e04 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 22 Jan 2022 12:40:03 -0500
Subject: [PATCH 3/9] fix(id): fix linter errors

---
 warn/scrapers/id.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index 756f3f38..db122510 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -27,7 +27,6 @@ def scrape(
 
     Returns: the Path where the file is written
     """
-
     state_code = "id"
     base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
     file_name = "WARNNotice.pdf"
@@ -110,6 +109,11 @@ def _clean_table(rows: list, page_index: int) -> list:
 def _clean_text(text: str) -> str:
     """
     Clean up text from a PDF cell.
+
+    Keyword arguments:
+    text -- the text to clean
+
+    Returns: the cleaned text
     """
     if text is None:
         return ""

From b9a50c71a3a1dc1cce0ea74f337389fe93f110be Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sat, 22 Jan 2022 12:54:47 -0500
Subject: [PATCH 4/9] refactor(id): use file_name var in cache key

---
 warn/scrapers/id.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index db122510..d6dc38ae 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -43,7 +43,7 @@ def scrape(
     cache_state = Path(cache_dir, state_code)
     cache_state.mkdir(parents=True, exist_ok=True)
 
-    cache_key = f"{cache_state}/WARNNotice.pdf"
+    cache_key = f"{cache_state}/{file_name}"
 
     # verify=False because there's a persistent cert error
     # we're working around.

From 6f5d8a5b4782b479df0372fff513bd4af36cc05a Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sun, 23 Jan 2022 23:18:58 -0500
Subject: [PATCH 5/9] fix(id): add type annotation

---
 warn/scrapers/id.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index d6dc38ae..a78089fa 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -51,7 +51,7 @@ def scrape(
     with open(cache_key, "wb") as file:
         file.write(response.content)
 
-    output_rows = []
+    output_rows: list = []
 
     with pdfplumber.open(cache_key) as pdf:
         for index, page in enumerate(pdf.pages):
@@ -76,7 +76,7 @@ def _clean_table(rows: list, page_index: int) -> list:
 
     Returns: a list of lists, where each inner list is a row in the table
     """
-    output_rows = []
+    output_rows: list = []
 
     for row_index, row in enumerate(rows):
         output_row = []

From 8aceba82e048c2e6dc6e0847783ac108eb9083e2 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 27 Jan 2022 11:31:31 -0500
Subject: [PATCH 6/9] refactor(id): incorporate revisions from code review

Co-authored-by: Ben Welsh
---
 warn/scrapers/id.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index a78089fa..a01b23c2 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -57,11 +57,11 @@ def scrape(
         for index, page in enumerate(pdf.pages):
             rows = page.extract_table()
 
-            output_rows = output_rows + _clean_table(rows, index)
+            output_rows += _clean_table(rows, index)
 
     # Write out the data to a CSV
-    data_path = Path(data_dir, f"{state_code}.csv")
-    utils.write_rows_to_csv(output_rows, data_path)
+    data_path = data_dir / f"{state_code}.csv"
+    utils.write_rows_to_csv(data_path, output_rows)
 
     return data_path
 
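
The merged-cell fill-down introduced in PATCH 2 and refined through PATCH 6 is easier to follow outside the diff context. The standalone sketch below reimplements the same idea; the fill_down helper and the sample rows are illustrative assumptions, not code from the patches, though the cleaning and fill-down rules mirror _clean_text and _clean_table above.

    import re


    def _clean_text(text):
        """Treat None as empty, collapse newlines and squeeze whitespace."""
        if text is None:
            return ""
        return re.sub(r"\s+", " ", re.sub(r"\n", " ", text))


    def fill_down(rows, skip_header="No. of Employees Affected"):
        """Copy a blank cell's value from the row above, except in the employee-count column."""
        cleaned = []
        for row_index, row in enumerate(rows):
            out = []
            for col_index, cell in enumerate(row):
                text = _clean_text(cell)
                if (
                    text == ""
                    and row_index > 0
                    and col_index < len(cleaned[row_index - 1])
                    and cleaned[0][col_index] != skip_header
                ):
                    # Inherit the value from the merged cell above
                    text = cleaned[row_index - 1][col_index]
                out.append(text)
            cleaned.append(out)
        return cleaned


    if __name__ == "__main__":
        sample = [
            ["Date of Letter", "Company", "No. of Employees Affected"],
            ["1/3/2022", "Acme Corp.\nBoise", "50"],
            [None, None, None],  # cells merged with the row above
        ]
        for row in fill_down(sample):
            print(row)

Running the sketch copies the date and company down into the third row but leaves its employee count blank, which is the double-counting guard the comments in _clean_table describe.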
From 4dbae6572e023dbf41c3e729645dadae77bc4182 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 27 Jan 2022 12:08:45 -0500
Subject: [PATCH 7/9] feat(cache): add kwargs to download

Allow arbitrary options to be passed to requests.get
---
 warn/cache.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/warn/cache.py b/warn/cache.py
index 5ac9b559..ef511cd1 100644
--- a/warn/cache.py
+++ b/warn/cache.py
@@ -71,20 +71,20 @@ def read_csv(self, name):
         with open(path) as fh:
             return list(csv.reader(fh))
 
-    def download(self, name: str, url: str) -> Path:
+    def download(self, name: str, url: str, **kwargs) -> Path:
         """
         Download the provided URL and save it in the cache.
 
         Args:
             name (str): The path where the file will be saved. Can be a simple string like "ia/data.xlsx"
            url (str): The URL to download
+            **kwargs: Additional arguments to pass to requests.get()
 
         Returns: The Path where the file was saved
         """
         # Request the URL
         logger.debug(f"Requesting {url}")
-        with requests.get(url, stream=True) as r:
-
+        with requests.get(url, stream=True, **kwargs) as r:
             # If there's no encoding, set it
             if r.encoding is None:
                 r.encoding = "utf-8"

From 2c1b0f38c1a309a12779e05c5ac2d42fe4fc09e9 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 27 Jan 2022 12:09:17 -0500
Subject: [PATCH 8/9] refactor(id): clean up scraper

---
 warn/scrapers/id.py | 66 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index a01b23c2..6847305d 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -4,9 +4,9 @@
 from pathlib import Path
 
 import pdfplumber
-import requests
 
 from .. import utils
+from ..cache import Cache
 
 __authors__ = ["chriszs"]
 __tags__ = ["pdf"]
@@ -27,6 +27,8 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    cache = Cache(cache_dir)
+
     state_code = "id"
     base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
     file_name = "WARNNotice.pdf"
@@ -40,24 +42,19 @@ def scrape(
 
     url = f"{base_url}{file_name}?v={cache_buster}"
 
-    cache_state = Path(cache_dir, state_code)
-    cache_state.mkdir(parents=True, exist_ok=True)
-
-    cache_key = f"{cache_state}/{file_name}"
+    cache_key = f"{state_code}/{file_name}"
 
     # verify=False because there's a persistent cert error
     # we're working around.
-    response = requests.get(url, verify=False)
-    with open(cache_key, "wb") as file:
-        file.write(response.content)
+    pdf_file = cache.download(cache_key, url, verify=False)
 
     output_rows: list = []
 
-    with pdfplumber.open(cache_key) as pdf:
+    with pdfplumber.open(pdf_file) as pdf:
         for index, page in enumerate(pdf.pages):
             rows = page.extract_table()
 
-            output_rows += _clean_table(rows, index)
+            output_rows += _clean_table(rows, index)
 
     # Write out the data to a CSV
     data_path = data_dir / f"{state_code}.csv"
@@ -88,10 +85,10 @@ def _clean_table(rows: list, page_index: int) -> list:
             # which is effectively a total for all locations in the merged cell
             # and which we don't want a data user to double count.
             if (
-                clean_text == ""
-                and row_index > 0
-                and col_index < len(output_rows[row_index - 1])
-                and output_rows[0][col_index] != "No. of Employees Affected"
+                _is_empty(clean_text)
+                and _column_exists_in_prior_row(row_index, col_index, output_rows)
+                and "No. of Employees"
+                not in _column_name_from_index(col_index, output_rows)
             ):
                 clean_text = output_rows[row_index - 1][col_index]
 
@@ -106,6 +103,47 @@ def _clean_table(rows: list, page_index: int) -> list:
     return output_rows
+
+
+def _is_empty(text: str) -> bool:
+    """
+    Determine if a cell is empty.
+
+    Keyword arguments:
+    text -- the text to check
+
+    Returns: True if the cell is empty, False otherwise
+    """
+    return text == ""
+
+
+def _column_exists_in_prior_row(
+    row_index: int, col_index: int, output_rows: list
+) -> bool:
+    """
+    Determine if a column exists in the prior row.
+
+    Keyword arguments:
+    row_index -- the index of the row
+    col_index -- the index of the column
+    output_rows -- the output rows
+
+    Returns: True if the column exists, False otherwise
+    """
+    return row_index > 0 and col_index < len(output_rows[row_index - 1])
+
+
+def _column_name_from_index(col_index: int, output_rows: list) -> str:
+    """
+    Determine the column name from the column index.
+
+    Keyword arguments:
+    col_index -- the index of the column
+    output_rows -- the output rows
+
+    Returns: the column name
+    """
+    return output_rows[0][col_index]
 
 
 def _clean_text(text: str) -> str:
     """
     Clean up text from a PDF cell.

From ff231bdc0228eb9e8ccf50a46d3d13a4bda4ee85 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 27 Jan 2022 12:14:39 -0500
Subject: [PATCH 9/9] refactor(id): adjust arg order in utils

---
 warn/scrapers/id.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index 6847305d..43454a67 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -86,9 +86,9 @@ def _clean_table(rows: list, page_index: int) -> list:
             # and which we don't want a data user to double count.
             if (
                 _is_empty(clean_text)
-                and _column_exists_in_prior_row(row_index, col_index, output_rows)
+                and _column_exists_in_prior_row(output_rows, row_index, col_index)
                 and "No. of Employees"
-                not in _column_name_from_index(col_index, output_rows)
+                not in _column_name_from_index(output_rows, col_index)
             ):
                 clean_text = output_rows[row_index - 1][col_index]
 
@@ -116,7 +116,7 @@ def _is_empty(text: str) -> bool:
 
 
 def _column_exists_in_prior_row(
-    row_index: int, col_index: int, output_rows: list
+    output_rows: list, row_index: int, col_index: int
 ) -> bool:
     """
     Determine if a column exists in the prior row.
@@ -131,7 +131,7 @@ def _column_exists_in_prior_row(
     return row_index > 0 and col_index < len(output_rows[row_index - 1])
 
 
-def _column_name_from_index(col_index: int, output_rows: list) -> str:
+def _column_name_from_index(output_rows: list, col_index: int) -> str:
     """
     Determine the column name from the column index.
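
Taken together, PATCH 7 and PATCH 8 replace the scraper's hand-rolled requests call with the shared Cache helper, with **kwargs forwarded to requests.get(). The following is a minimal usage sketch of that call path, assuming the package is importable as warn; the driver script itself is hypothetical, while the Cache class, WARN_CACHE_DIR, the cache key, and the URL mirror the values used in the patches above.

    from warn.cache import Cache
    from warn.utils import WARN_CACHE_DIR

    # Cache stores downloads under the shared cache directory;
    # "id/WARNNotice.pdf" becomes the file's relative path inside it.
    cache = Cache(WARN_CACHE_DIR)

    # verify=False is forwarded to requests.get() through the **kwargs added in
    # PATCH 7, preserving the certificate workaround noted in the scraper.
    pdf_path = cache.download(
        "id/WARNNotice.pdf",
        "https://www.labor.idaho.gov/dnn/Portals/0/Publications/WARNNotice.pdf",
        verify=False,
    )
    print(pdf_path)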