From b797433c101c639afb086d7f819561eb51d7f486 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Wed, 19 Jan 2022 19:50:06 -0500
Subject: [PATCH 01/12] feat(ga): add scraper

Closes #63
---
 warn/scrapers/ga.py | 80 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 warn/scrapers/ga.py

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
new file mode 100644
index 00000000..1950acdf
--- /dev/null
+++ b/warn/scrapers/ga.py
@@ -0,0 +1,80 @@
+import re
+import typing
+from datetime import datetime
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from .. import utils
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: typing.Optional[Path] = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from Georgia.
+
+    Keyword arguments:
+    data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
+    Returns: the Path where the file is written
+    """
+    area = 9  # statewide
+
+    current_year = datetime.now().year
+    first_year = 2002  # first available year
+
+    years = list(range(first_year, current_year)).reverse()
+
+    # include column headers in first row
+    column_tags = ["td", "th"]
+
+    output_rows = []
+
+    for year in years:
+        url = f"https://www.dol.state.ga.us/public/es/warn/searchwarns/list?geoArea={area}&year={year}&step=search"
+
+        # Get URL
+        page = utils.get_url(url)
+
+        # Force encoding to fix dashes, apostrophes, etc. on page.text from requests response
+        page.encoding = "utf-8"
+
+        # Parse out data table
+        soup = BeautifulSoup(page.text, "html.parser")
+        table = soup.find_all(id="emplrList")  # output is list-type
+
+        # Loop through the table and grab the data
+        for table_row in table[0].find_all("tr"):
+            columns = table_row.find_all(column_tags)
+            output_row = []
+
+            for column in columns:
+                # Collapse newlines
+                partial = re.sub(r"\n", " ", column.text)
+                # Standardize whitespace
+                clean_text = re.sub(r"\s+", " ", partial)
+                output_row.append(clean_text)
+
+            output_row = [x.strip() for x in output_row]
+
+            if len(output_row) == 0 or output_row == [""]:
+                continue
+
+            output_rows.append(output_row)
+
+        # exclude column headers on subsequent years
+        column_tags = ["td"]
+
+    # Write out the data to a CSV
+    data_path = data_dir / "ga.csv"
+    utils.write_rows_to_csv(output_rows, data_path)
+
+    # Return the Path to the CSV
+    return data_path
+
+
+if __name__ == "__main__":
+    scrape()

From a0c509537599b8b90dcf9034a77dcf2420d249d2 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Thu, 20 Jan 2022 05:56:52 -0500
Subject: [PATCH 02/12] fix(ga): reverse year list after defining it

---
 warn/scrapers/ga.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 1950acdf..b50812d0 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -26,7 +26,8 @@ def scrape(
     current_year = datetime.now().year
     first_year = 2002  # first available year
 
-    years = list(range(first_year, current_year)).reverse()
+    years = list(range(first_year, current_year))
+    years.reverse()
 
     # include column headers in first row
     column_tags = ["td", "th"]
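A note on the bug PATCH 02 fixes: list.reverse() reverses in place and returns None, so the chained call in PATCH 01 bound years to None, and the for-loop over it would raise a TypeError on the first run. A minimal sketch of the pitfall and two equivalent fixes, using an illustrative year range:

    years = list(range(2002, 2022)).reverse()
    assert years is None  # reverse() mutates the list and returns None

    # The fix as committed: build the list, then reverse it in place
    years = list(range(2002, 2022))
    years.reverse()

    # An equivalent single expression
    years = list(reversed(range(2002, 2022)))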
From e1cdb66cc98a3d0994b9aff24e93b68d32be603e Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 04:44:49 -0500
Subject: [PATCH 03/12] feat(ga): add caching

---
 warn/scrapers/ga.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index b50812d0..8a3e9032 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -6,6 +6,7 @@
 from bs4 import BeautifulSoup
 
 from .. import utils
+from ..cache import Cache
 
 
 def scrape(
@@ -39,12 +40,13 @@ def scrape(
 
         # Get URL
         page = utils.get_url(url)
+        html = page.text
 
-        # Force encoding to fix dashes, apostrophes, etc. on page.text from requests response
-        page.encoding = "utf-8"
+        cache = Cache(cache_dir)
+        cache.write(f"ga/{year}.html", html)
 
         # Parse out data table
-        soup = BeautifulSoup(page.text, "html.parser")
+        soup = BeautifulSoup(html, "html.parser")
         table = soup.find_all(id="emplrList")  # output is list-type
 
         # Loop through the table and grab the data
@@ -70,7 +72,7 @@ def scrape(
         column_tags = ["td"]
 
     # Write out the data to a CSV
-    data_path = data_dir / "ga.csv"
+    data_path = f"{data_dir}/ga.csv"
     utils.write_rows_to_csv(output_rows, data_path)
 
     # Return the Path to the CSV

From d1667064cc4a13ef30395dec58a0cc51b407ccdc Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 05:07:41 -0500
Subject: [PATCH 04/12] fix(ga): include current year

---
 warn/scrapers/ga.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 8a3e9032..6da47e9f 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -27,7 +27,7 @@ def scrape(
     current_year = datetime.now().year
     first_year = 2002  # first available year
 
-    years = list(range(first_year, current_year))
+    years = list(range(first_year, current_year + 1))
     years.reverse()
 
     # include column headers in first row

From 6be173838755deb5f9a3aa98c146da84fbe8bf8a Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 05:08:02 -0500
Subject: [PATCH 05/12] feat(ga): read from cache for prior years

---
 warn/scrapers/ga.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 6da47e9f..dce271ef 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -22,6 +22,8 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    cache = Cache(cache_dir)
+
     area = 9  # statewide
 
     current_year = datetime.now().year
@@ -38,12 +40,15 @@ def scrape(
 
     for year in years:
         url = f"https://www.dol.state.ga.us/public/es/warn/searchwarns/list?geoArea={area}&year={year}&step=search"
 
-        # Get URL
-        page = utils.get_url(url)
-        html = page.text
+        cache_key = f"ga/{year}.html"
 
-        cache = Cache(cache_dir)
-        cache.write(f"ga/{year}.html", html)
+        if cache.exists(cache_key) and year < current_year:
+            html = cache.read(cache_key)
+        else:
+            # Get URL
+            page = utils.get_url(url)
+            html = page.text
+            cache.write(cache_key, html)
 
         # Parse out data table
         soup = BeautifulSoup(html, "html.parser")
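PATCH 03 and PATCH 05 depend on a small Cache class imported from warn/cache.py, which this series never shows. Judging only from the three calls made here, it behaves roughly like the sketch below: a file-backed store keyed by relative paths such as "ga/2002.html". This is an assumption for illustration; the real class and its signatures may differ.

    from pathlib import Path


    class Cache:
        def __init__(self, path):
            # Root directory for cached files, e.g. utils.WARN_CACHE_DIR
            self.path = Path(path)

        def exists(self, name):
            # Hypothetical: keys map directly to files under the root
            return (self.path / name).exists()

        def read(self, name):
            return (self.path / name).read_text()

        def write(self, name, content):
            full_path = self.path / name
            # Keys like "ga/2002.html" imply one subdirectory per state
            full_path.parent.mkdir(parents=True, exist_ok=True)
            full_path.write_text(content)
            return full_path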
From 50935f638b68bb161dc1efb7365800f58498e2ae Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 11:41:59 -0500
Subject: [PATCH 06/12] refactor: apply suggestions from code review

Co-authored-by: Ben Welsh
---
 warn/scrapers/ga.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index dce271ef..a68bdd8f 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -1,5 +1,4 @@
 import re
-import typing
 from datetime import datetime
 from pathlib import Path
 
@@ -11,7 +10,7 @@
 
 def scrape(
     data_dir: Path = utils.WARN_DATA_DIR,
-    cache_dir: typing.Optional[Path] = utils.WARN_CACHE_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
 ) -> Path:
     """
     Scrape data from Georgia.
@@ -52,10 +51,14 @@ def scrape(
 
         # Parse out data table
         soup = BeautifulSoup(html, "html.parser")
-        table = soup.find_all(id="emplrList")  # output is list-type
+        table_list = soup.find_all(id="emplrList")  # output is list-type
+
+        # We expect the first table to be there with our data
+        assert len(table_list) > 0
+        table = table_list[0]
 
         # Loop through the table and grab the data
-        for table_row in table[0].find_all("tr"):
+        for table_row in table.find_all("tr"):
             columns = table_row.find_all(column_tags)
             output_row = []
 
@@ -63,11 +66,9 @@ def scrape(
                 # Collapse newlines
                 partial = re.sub(r"\n", " ", column.text)
                 # Standardize whitespace
-                clean_text = re.sub(r"\s+", " ", partial)
+                clean_text = re.sub(r"\s+", " ", partial).strip()
                 output_row.append(clean_text)
-
-            output_row = [x.strip() for x in output_row]
-
+            # Skip any empty rows
             if len(output_row) == 0 or output_row == [""]:
                 continue
 
             output_rows.append(output_row)

From ab95d3f23307700409b67b78d5e8558c6fa47c17 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:05:01 -0500
Subject: [PATCH 07/12] refactor(ga): move table parsing into function

---
 warn/scrapers/ga.py | 75 +++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 30 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index a68bdd8f..ac220a2e 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -8,6 +8,42 @@
 from ..cache import Cache
 
 
+def parse_table(html, id, include_headers=False):
+    # Parse out data table
+    soup = BeautifulSoup(html, "html.parser")
+    table_list = soup.find_all(id=id)  # output is list-type
+
+    # We expect the first table to be there with our data
+    assert len(table_list) > 0
+    table = table_list[0]
+
+    output_rows = []
+    column_tags = ["td"]
+
+    if include_headers:
+        column_tags.append("th")
+
+    # Loop through the table and grab the data
+    for table_row in table.find_all("tr"):
+        columns = table_row.find_all(column_tags)
+        output_row = []
+
+        for column in columns:
+            # Collapse newlines
+            partial = re.sub(r"\n", " ", column.text)
+            # Standardize whitespace
+            clean_text = re.sub(r"\s+", " ", partial).strip()
+            output_row.append(clean_text)
+
+        # Skip any empty rows
+        if len(output_row) == 0 or output_row == [""]:
+            continue
+
+        output_rows.append(output_row)
+
+    return output_rows
+
+
 def scrape(
     data_dir: Path = utils.WARN_DATA_DIR,
     cache_dir: Path = utils.WARN_CACHE_DIR,
@@ -21,6 +57,8 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    base_url = "https://www.dol.state.ga.us/public/es/warn/searchwarns/list"
+
     cache = Cache(cache_dir)
 
     area = 9  # statewide
@@ -31,13 +69,12 @@ def scrape(
     years = list(range(first_year, current_year + 1))
     years.reverse()
 
-    # include column headers in first row
-    column_tags = ["td", "th"]
+    include_headers = True
 
     output_rows = []
 
     for year in years:
-        url = f"https://www.dol.state.ga.us/public/es/warn/searchwarns/list?geoArea={area}&year={year}&step=search"
+        url = f"{base_url}?geoArea={area}&year={year}&step=search"
 
         cache_key = f"ga/{year}.html"
 
         if cache.exists(cache_key) and year < current_year:
             html = cache.read(cache_key)
         else:
             # Get URL
             page = utils.get_url(url)
             html = page.text
             cache.write(cache_key, html)
 
-        # Parse out data table
-        soup = BeautifulSoup(html, "html.parser")
-        table_list = soup.find_all(id="emplrList")  # output is list-type
-
-        # We expect the first table to be there with our data
-        assert len(table_list) > 0
-        table = table_list[0]
-
-        # Loop through the table and grab the data
-        for table_row in table.find_all("tr"):
-            columns = table_row.find_all(column_tags)
-            output_row = []
-
-            for column in columns:
-                # Collapse newlines
-                partial = re.sub(r"\n", " ", column.text)
-                # Standardize whitespace
-                clean_text = re.sub(r"\s+", " ", partial).strip()
-                output_row.append(clean_text)
-            # Skip any empty rows
-            if len(output_row) == 0 or output_row == [""]:
-                continue
-
-            output_rows.append(output_row)
-
-        # exclude column headers on subsequent years
-        column_tags = ["td"]
+        new_rows = parse_table(html, "emplrList", include_headers=include_headers)
+
+        output_rows = output_rows + new_rows
+
+        include_headers = False
 
     # Write out the data to a CSV
     data_path = f"{data_dir}/ga.csv"
     utils.write_rows_to_csv(output_rows, data_path)
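With the parsing moved into parse_table, that function can now be exercised on its own. A quick sketch against a toy table shaped like the emplrList table the scraper expects; the markup is invented for illustration:

    from warn.scrapers.ga import parse_table

    html = """
    <table id="emplrList">
      <tr><th>Company</th><th>City</th></tr>
      <tr><td>Acme
      Widgets</td><td>Atlanta</td></tr>
    </table>
    """

    # include_headers=True keeps the <th> row as the first output row;
    # the newline inside "Acme Widgets" is collapsed by the regex cleanup
    rows = parse_table(html, "emplrList", include_headers=True)
    print(rows)
    # [['Company', 'City'], ['Acme Widgets', 'Atlanta']]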
From 0a15dc563b032ffce40967d62171640f348dccce Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:08:15 -0500
Subject: [PATCH 08/12] refactor(ga): tiny cosmetic changes

---
 warn/scrapers/ga.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index ac220a2e..c1573eb5 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -75,13 +75,11 @@ def scrape(
 
     for year in years:
         url = f"{base_url}?geoArea={area}&year={year}&step=search"
-
         cache_key = f"ga/{year}.html"
 
         if cache.exists(cache_key) and year < current_year:
             html = cache.read(cache_key)
         else:
-            # Get URL
             page = utils.get_url(url)
             html = page.text
             cache.write(cache_key, html)

From 46a670f46d333dfb798e6307562f5b8307476aad Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:11:18 -0500
Subject: [PATCH 09/12] docs(ga): add docstring to parse_table

---
 warn/scrapers/ga.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index c1573eb5..6836861e 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -8,7 +8,18 @@
 from ..cache import Cache
 
 
-def parse_table(html, id, include_headers=False):
+def parse_table(html, id, include_headers=True):
+    """
+    Parse HTML table with given ID.
+
+    Keyword arguments:
+    html -- the HTML to parse
+    id -- the ID of the table to parse
+    include_headers -- whether to include the headers in the output (default True)
+
+    Returns: a list of rows
+    """
+
     # Parse out data table
     soup = BeautifulSoup(html, "html.parser")
     table_list = soup.find_all(id=id)  # output is list-type
From 1f47801940161006568fc12a12b39883864fff75 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:15:18 -0500
Subject: [PATCH 10/12] refactor(ga): define and reuse state_code

---
 warn/scrapers/ga.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 6836861e..62ac4f61 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -69,6 +69,7 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    state_code = "ga"
     base_url = "https://www.dol.state.ga.us/public/es/warn/searchwarns/list"
 
     cache = Cache(cache_dir)
@@ -87,7 +88,7 @@ def scrape(
 
     for year in years:
         url = f"{base_url}?geoArea={area}&year={year}&step=search"
-        cache_key = f"ga/{year}.html"
+        cache_key = f"{state_code}/{year}.html"
 
         if cache.exists(cache_key) and year < current_year:
             html = cache.read(cache_key)
@@ -103,7 +104,7 @@ def scrape(
         include_headers = False
 
     # Write out the data to a CSV
-    data_path = f"{data_dir}/ga.csv"
+    data_path = f"{data_dir}/{state_code}.csv"
     utils.write_rows_to_csv(output_rows, data_path)
 
     # Return the Path to the CSV

From 3313aea9e2a943f6e36cc0383e6c73763ff20b1f Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:20:54 -0500
Subject: [PATCH 11/12] refactor(ga): move variables, modify comments

---
 warn/scrapers/ga.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index 62ac4f61..e792c9fb 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -69,10 +69,11 @@ def scrape(
 
     Returns: the Path where the file is written
     """
+    cache = Cache(cache_dir)
+
     state_code = "ga"
     base_url = "https://www.dol.state.ga.us/public/es/warn/searchwarns/list"
-
-    cache = Cache(cache_dir)
+    data_path = f"{data_dir}/{state_code}.csv"
 
     area = 9  # statewide
@@ -103,11 +104,9 @@ def scrape(
 
         include_headers = False
 
-    # Write out the data to a CSV
-    data_path = f"{data_dir}/{state_code}.csv"
     utils.write_rows_to_csv(output_rows, data_path)
 
-    # Return the Path to the CSV
+    # Return the path to the CSV
     return data_path

From 9e30a18be344ceaaa34af1247eed91adb1a32574 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Fri, 21 Jan 2022 12:22:57 -0500
Subject: [PATCH 12/12] feat(ga): also get prior year regardless of cache

---
 warn/scrapers/ga.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/warn/scrapers/ga.py b/warn/scrapers/ga.py
index e792c9fb..751c8aae 100644
--- a/warn/scrapers/ga.py
+++ b/warn/scrapers/ga.py
@@ -91,7 +91,8 @@ def scrape(
         url = f"{base_url}?geoArea={area}&year={year}&step=search"
         cache_key = f"{state_code}/{year}.html"
 
-        if cache.exists(cache_key) and year < current_year:
+        # Read from cache if available and not this year or the year before
+        if cache.exists(cache_key) and year < current_year - 1:
             html = cache.read(cache_key)
         else:
             page = utils.get_url(url)
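The revised predicate in PATCH 12 is easy to misread: a cached page is trusted only for years strictly older than last year, so the current year and the prior year are always refetched, presumably because Georgia may still amend recent notices. A small worked example of the year arithmetic, with 2022 standing in for datetime.now().year:

    current_year = 2022  # stand-in for datetime.now().year

    for year in (2002, 2020, 2021, 2022):
        # The scraper also requires cache.exists(cache_key); omitted here
        use_cache = year < current_year - 1
        print(year, "read from cache" if use_cache else "refetch from site")

    # 2002 read from cache
    # 2020 read from cache
    # 2021 refetch from site
    # 2022 refetch from site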