From b797433c101c639afb086d7f819561eb51d7f486 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Wed, 19 Jan 2022 19:50:06 -0500
Subject: [PATCH 01/12] feat(ga): add scraper

Closes #63
---
 warn/scrapers/ga.py | 80 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 warn/scrapers/ga.py

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
new file mode 100644
index 00000000..1950acdf
--- /dev/null
+++ b/warn/scrapers/ga.py
@@ -0,0 +1,80 @@
+import re
+import typing
+from datetime import datetime
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from .. import utils
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: typing.Optional[Path] = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from Georgia.
+
+    Keyword arguments:
+    data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
+    Returns: the Path where the file is written
+    """
+    area = 9  # statewide
+
+    current_year = datetime.now().year
+    first_year = 2002  # first available year
+
+    years = list(range(first_year, current_year)).reverse()
+
+    # include column headers in first row
+    column_tags = ["td", "th"]
+
+    output_rows = []
+
+    for year in years:
+        url = f"https://www.dol.state.ga.us/public/es/warn/searchwarns/list?geoArea={area}&year={year}&step=search"
+
+        # Get URL
+        page = utils.get_url(url)
+
+        # Force encoding to fix dashes, apostrophes, etc. on page.text from requests response
+        page.encoding = "utf-8"
+
+        # Parse out data table
+        soup = BeautifulSoup(page.text, "html.parser")
+        table = soup.find_all(id="emplrList")  # output is list-type
+
+        # Loop through the table and grab the data
+        for table_row in table[0].find_all("tr"):
+            columns = table_row.find_all(column_tags)
+            output_row = []
+
+            for column in columns:
+                # Collapse newlines
+                partial = re.sub(r"\n", " ", column.text)
+                # Standardize whitespace
+                clean_text = re.sub(r"\s+", " ", partial)
+                output_row.append(clean_text)
+
+            output_row = [x.strip() for x in output_row]
+
+            if len(output_row) == 0 or output_row == [""]:
+                continue
+
+            output_rows.append(output_row)
+
+        # exclude column headers on subsequent years
+        column_tags = ["td"]
+
+    # Write out the data to a CSV
+    data_path = data_dir / "ga.csv"
+    utils.write_rows_to_csv(output_rows, data_path)
+
+    # Return the Path to the CSV
+    return data_path
+
+
+if __name__ == "__main__":
+    scrape()

From a0c509537599b8b90dcf9034a77dcf2420d249d2 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 20 Jan 2022 05:56:52 -0500
Subject: [PATCH 02/12] fix(ga): reverse year list after defining it

---
 warn/scrapers/ga.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 1950acdf..b50812d0 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -26,7 +26,8 @@ def scrape(
     current_year = datetime.now().year
     first_year = 2002  # first available year
 
-    years = list(range(first_year, current_year)).reverse()
+    years = list(range(first_year, current_year))
+    years.reverse()
 
     # include column headers in first row
     column_tags = ["td", "th"]
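A note on the bug PATCH 02 fixes: list.reverse() reverses in place and returns None, so the chained call in PATCH 01 bound years to None, and the for-loop over it would raise a TypeError on the first run. A minimal sketch of the pitfall and two equivalent fixes, using an illustrative year range:

    years = list(range(2002, 2022)).reverse()
    assert years is None  # reverse() mutates the list and returns None

    # The fix as committed: build the list, then reverse it in place
    years = list(range(2002, 2022))
    years.reverse()

    # An equivalent single expression
    years = list(reversed(range(2002, 2022)))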
From e1cdb66cc98a3d0994b9aff24e93b68d32be603e Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 04:44:49 -0500
Subject: [PATCH 03/12] feat(ga): add caching

---
 warn/scrapers/ga.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index b50812d0..8a3e9032 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -6,6 +6,7 @@
 from bs4 import BeautifulSoup
 
 from .. import utils
+from ..cache import Cache
 
 
 def scrape(
@@ -39,12 +40,13 @@ def scrape(
 
         # Get URL
         page = utils.get_url(url)
+        html = page.text
 
-        # Force encoding to fix dashes, apostrophes, etc. on page.text from requests response
-        page.encoding = "utf-8"
+        cache = Cache(cache_dir)
+        cache.write(f"ga/{year}.html", html)
 
         # Parse out data table
-        soup = BeautifulSoup(page.text, "html.parser")
+        soup = BeautifulSoup(html, "html.parser")
         table = soup.find_all(id="emplrList")  # output is list-type
 
         # Loop through the table and grab the data
@@ -70,7 +72,7 @@ def scrape(
         column_tags = ["td"]
 
     # Write out the data to a CSV
-    data_path = data_dir / "ga.csv"
+    data_path = f"{data_dir}/ga.csv"
     utils.write_rows_to_csv(output_rows, data_path)
 
     # Return the Path to the CSV

From d1667064cc4a13ef30395dec58a0cc51b407ccdc Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 05:07:41 -0500
Subject: [PATCH 04/12] fix(ga): include current year

---
 warn/scrapers/ga.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 8a3e9032..6da47e9f 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -27,7 +27,7 @@ def scrape(
     current_year = datetime.now().year
     first_year = 2002  # first available year
 
-    years = list(range(first_year, current_year))
+    years = list(range(first_year, current_year + 1))
     years.reverse()
 
     # include column headers in first row

From 6be173838755deb5f9a3aa98c146da84fbe8bf8a Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 05:08:02 -0500
Subject: [PATCH 05/12] feat(ga): read from cache for prior years

---
 warn/scrapers/ga.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 6da47e9f..dce271ef 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -22,6 +22,8 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    cache = Cache(cache_dir)
+
     area = 9  # statewide
 
     current_year = datetime.now().year
@@ -38,12 +40,15 @@ def scrape(
 
     for year in years:
         url = f"https://www.dol.state.ga.us/public/es/warn/searchwarns/list?geoArea={area}&year={year}&step=search"
 
-        # Get URL
-        page = utils.get_url(url)
-        html = page.text
+        cache_key = f"ga/{year}.html"
 
-        cache = Cache(cache_dir)
-        cache.write(f"ga/{year}.html", html)
+        if cache.exists(cache_key) and year < current_year:
+            html = cache.read(cache_key)
+        else:
+            # Get URL
+            page = utils.get_url(url)
+            html = page.text
+            cache.write(cache_key, html)
 
         # Parse out data table
         soup = BeautifulSoup(html, "html.parser")
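PATCH 03 and PATCH 05 depend on a small Cache class imported from warn/cache.py, which this series never shows. Judging only from the three calls made here, it behaves roughly like the sketch below: a file-backed store keyed by relative paths such as "ga/2002.html". This is an assumption for illustration; the real class and its signatures may differ.

    from pathlib import Path


    class Cache:
        def __init__(self, path):
            # Root directory for cached files, e.g. utils.WARN_CACHE_DIR
            self.path = Path(path)

        def exists(self, name):
            # Hypothetical: keys map directly to files under the root
            return (self.path / name).exists()

        def read(self, name):
            return (self.path / name).read_text()

        def write(self, name, content):
            full_path = self.path / name
            # Keys like "ga/2002.html" imply one subdirectory per state
            full_path.parent.mkdir(parents=True, exist_ok=True)
            full_path.write_text(content)
            return full_path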
From 50935f638b68bb161dc1efb7365800f58498e2ae Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 11:41:59 -0500
Subject: [PATCH 06/12] refactor: apply suggestions from code review

Co-authored-by: Ben Welsh
---
 warn/scrapers/ga.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index dce271ef..a68bdd8f 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -1,5 +1,4 @@
 import re
-import typing
 from datetime import datetime
 from pathlib import Path
 
@@ -11,7 +10,7 @@
 
 def scrape(
     data_dir: Path = utils.WARN_DATA_DIR,
-    cache_dir: typing.Optional[Path] = utils.WARN_CACHE_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
 ) -> Path:
     """
     Scrape data from Georgia.
@@ -52,10 +51,14 @@ def scrape(
 
         # Parse out data table
         soup = BeautifulSoup(html, "html.parser")
-        table = soup.find_all(id="emplrList")  # output is list-type
+        table_list = soup.find_all(id="emplrList")  # output is list-type
+
+        # We expect the first table to be there with our data
+        assert len(table_list) > 0
+        table = table_list[0]
 
         # Loop through the table and grab the data
-        for table_row in table[0].find_all("tr"):
+        for table_row in table.find_all("tr"):
             columns = table_row.find_all(column_tags)
             output_row = []
 
@@ -63,11 +66,9 @@ def scrape(
                 # Collapse newlines
                 partial = re.sub(r"\n", " ", column.text)
                 # Standardize whitespace
-                clean_text = re.sub(r"\s+", " ", partial)
+                clean_text = re.sub(r"\s+", " ", partial).strip()
                 output_row.append(clean_text)
-
-            output_row = [x.strip() for x in output_row]
-
+            # Skip any empty rows
             if len(output_row) == 0 or output_row == [""]:
                 continue
 
             output_rows.append(output_row)

From ab95d3f23307700409b67b78d5e8558c6fa47c17 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:05:01 -0500
Subject: [PATCH 07/12] refactor(ga): move table parsing into function

---
 warn/scrapers/ga.py | 75 +++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 30 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index a68bdd8f..ac220a2e 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -8,6 +8,42 @@
 from ..cache import Cache
 
 
+def parse_table(html, id, include_headers=False):
+    # Parse out data table
+    soup = BeautifulSoup(html, "html.parser")
+    table_list = soup.find_all(id=id)  # output is list-type
+
+    # We expect the first table to be there with our data
+    assert len(table_list) > 0
+    table = table_list[0]
+
+    output_rows = []
+    column_tags = ["td"]
+
+    if include_headers:
+        column_tags.append("th")
+
+    # Loop through the table and grab the data
+    for table_row in table.find_all("tr"):
+        columns = table_row.find_all(column_tags)
+        output_row = []
+
+        for column in columns:
+            # Collapse newlines
+            partial = re.sub(r"\n", " ", column.text)
+            # Standardize whitespace
+            clean_text = re.sub(r"\s+", " ", partial).strip()
+            output_row.append(clean_text)
+
+        # Skip any empty rows
+        if len(output_row) == 0 or output_row == [""]:
+            continue
+
+        output_rows.append(output_row)
+
+    return output_rows
+
+
 def scrape(
     data_dir: Path = utils.WARN_DATA_DIR,
     cache_dir: Path = utils.WARN_CACHE_DIR,
@@ -21,6 +57,8 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    base_url = "https://www.dol.state.ga.us/public/es/warn/searchwarns/list"
+
     cache = Cache(cache_dir)
 
     area = 9  # statewide
@@ -31,13 +69,12 @@ def scrape(
     years = list(range(first_year, current_year + 1))
     years.reverse()
 
-    # include column headers in first row
-    column_tags = ["td", "th"]
+    include_headers = True
 
     output_rows = []
 
     for year in years:
-        url = f"https://www.dol.state.ga.us/public/es/warn/searchwarns/list?geoArea={area}&year={year}&step=search"
+        url = f"{base_url}?geoArea={area}&year={year}&step=search"
 
         cache_key = f"ga/{year}.html"
 
         if cache.exists(cache_key) and year < current_year:
             html = cache.read(cache_key)
         else:
             # Get URL
             page = utils.get_url(url)
             html = page.text
             cache.write(cache_key, html)
 
-        # Parse out data table
-        soup = BeautifulSoup(html, "html.parser")
-        table_list = soup.find_all(id="emplrList")  # output is list-type
-
-        # We expect the first table to be there with our data
-        assert len(table_list) > 0
-        table = table_list[0]
-
-        # Loop through the table and grab the data
-        for table_row in table.find_all("tr"):
-            columns = table_row.find_all(column_tags)
-            output_row = []
-
-            for column in columns:
-                # Collapse newlines
-                partial = re.sub(r"\n", " ", column.text)
-                # Standardize whitespace
-                clean_text = re.sub(r"\s+", " ", partial).strip()
-                output_row.append(clean_text)
-            # Skip any empty rows
-            if len(output_row) == 0 or output_row == [""]:
-                continue
-
-            output_rows.append(output_row)
-
-        # exclude column headers on subsequent years
-        column_tags = ["td"]
+        new_rows = parse_table(html, "emplrList", include_headers=include_headers)
+
+        output_rows = output_rows + new_rows
+
+        include_headers = False
 
     # Write out the data to a CSV
     data_path = f"{data_dir}/ga.csv"
     utils.write_rows_to_csv(output_rows, data_path)
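With the parsing moved into parse_table, that function can now be exercised on its own. A quick sketch against a toy table shaped like the emplrList table the scraper expects; the markup is invented for illustration:

    from warn.scrapers.ga import parse_table

    html = """
    <table id="emplrList">
      <tr><th>Company</th><th>City</th></tr>
      <tr><td>Acme
      Widgets</td><td>Atlanta</td></tr>
    </table>
    """

    # include_headers=True keeps the <th> row as the first output row;
    # the newline inside "Acme Widgets" is collapsed by the regex cleanup
    rows = parse_table(html, "emplrList", include_headers=True)
    print(rows)
    # [['Company', 'City'], ['Acme Widgets', 'Atlanta']]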
From 0a15dc563b032ffce40967d62171640f348dccce Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:08:15 -0500
Subject: [PATCH 08/12] refactor(ga): tiny cosmetic changes

---
 warn/scrapers/ga.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index ac220a2e..c1573eb5 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -75,13 +75,11 @@ def scrape(
 
     for year in years:
         url = f"{base_url}?geoArea={area}&year={year}&step=search"
-
         cache_key = f"ga/{year}.html"
 
         if cache.exists(cache_key) and year < current_year:
             html = cache.read(cache_key)
         else:
-            # Get URL
             page = utils.get_url(url)
             html = page.text
             cache.write(cache_key, html)

From 46a670f46d333dfb798e6307562f5b8307476aad Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:11:18 -0500
Subject: [PATCH 09/12] docs(ga): add docstring to parse_table

---
 warn/scrapers/ga.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index c1573eb5..6836861e 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -8,7 +8,18 @@
 from ..cache import Cache
 
 
-def parse_table(html, id, include_headers=False):
+def parse_table(html, id, include_headers=True):
+    """
+    Parse HTML table with given ID.
+
+    Keyword arguments:
+    html -- the HTML to parse
+    id -- the ID of the table to parse
+    include_headers -- whether to include the headers in the output (default True)
+
+    Returns: a list of rows
+    """
+
     # Parse out data table
     soup = BeautifulSoup(html, "html.parser")
     table_list = soup.find_all(id=id)  # output is list-type
From 1f47801940161006568fc12a12b39883864fff75 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:15:18 -0500
Subject: [PATCH 10/12] refactor(ga): define and reuse state_code

---
 warn/scrapers/ga.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 6836861e..62ac4f61 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -69,6 +69,7 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    state_code = "ga"
     base_url = "https://www.dol.state.ga.us/public/es/warn/searchwarns/list"
 
     cache = Cache(cache_dir)
@@ -87,7 +88,7 @@ def scrape(
 
     for year in years:
         url = f"{base_url}?geoArea={area}&year={year}&step=search"
-        cache_key = f"ga/{year}.html"
+        cache_key = f"{state_code}/{year}.html"
 
         if cache.exists(cache_key) and year < current_year:
             html = cache.read(cache_key)
@@ -103,7 +104,7 @@ def scrape(
         include_headers = False
 
     # Write out the data to a CSV
-    data_path = f"{data_dir}/ga.csv"
+    data_path = f"{data_dir}/{state_code}.csv"
     utils.write_rows_to_csv(output_rows, data_path)
 
     # Return the Path to the CSV

From 3313aea9e2a943f6e36cc0383e6c73763ff20b1f Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:20:54 -0500
Subject: [PATCH 11/12] refactor(ga): move variables, modify comments

---
 warn/scrapers/ga.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 62ac4f61..e792c9fb 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -69,10 +69,11 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    cache = Cache(cache_dir)
+
     state_code = "ga"
     base_url = "https://www.dol.state.ga.us/public/es/warn/searchwarns/list"
-
-    cache = Cache(cache_dir)
+    data_path = f"{data_dir}/{state_code}.csv"
 
     area = 9  # statewide
@@ -103,11 +104,9 @@ def scrape(
 
         include_headers = False
 
-    # Write out the data to a CSV
-    data_path = f"{data_dir}/{state_code}.csv"
     utils.write_rows_to_csv(output_rows, data_path)
 
-    # Return the Path to the CSV
+    # Return the path to the CSV
     return data_path

From 9e30a18be344ceaaa34af1247eed91adb1a32574 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:22:57 -0500
Subject: [PATCH 12/12] feat(ga): also get prior year regardless of cache

---
 warn/scrapers/ga.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index e792c9fb..751c8aae 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -91,7 +91,8 @@ def scrape(
         url = f"{base_url}?geoArea={area}&year={year}&step=search"
         cache_key = f"{state_code}/{year}.html"
 
-        if cache.exists(cache_key) and year < current_year:
+        # Read from cache if available and not this year or the year before
+        if cache.exists(cache_key) and year < current_year - 1:
             html = cache.read(cache_key)
         else:
             page = utils.get_url(url)
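The revised predicate in PATCH 12 is easy to misread: a cached page is trusted only for years strictly older than last year, so the current year and the prior year are always refetched, presumably because Georgia may still amend recent notices. A small worked example of the year arithmetic, with 2022 standing in for datetime.now().year:

    current_year = 2022  # stand-in for datetime.now().year

    for year in (2002, 2020, 2021, 2022):
        # The scraper also requires cache.exists(cache_key); omitted here
        use_cache = year < current_year - 1
        print(year, "read from cache" if use_cache else "refetch from site")

    # 2002 read from cache
    # 2020 read from cache
    # 2021 refetch from site
    # 2022 refetch from site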