biglocalnews · stucka · Dec 23, 2024 · Dec 22, 2024 · Dec 23, 2024 · Dec 23, 2024
diff --git a/Pipfile b/Pipfile
@@ -45,6 +45,9 @@ click = "*"
 xlrd = "*"
 retry = "*"
 urllib3 = "1.26.18" # pegged to avoid test issue
+selenium = "*"
+webdriver-manager = "*"
+cryptography = "*"
 
 [requires]
 python_version = "3.9"

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/setup.py b/setup.py
@@ -133,9 +133,11 @@ def run(self):
         "pdfplumber",
         "requests",
         "openpyxl",
-        "xlrd",
-        "tenacity",
         "retry",
+        "selenium",
+        "tenacity",
+        "xlrd",
+        "webdriver-manager",
     ],
     license="Apache 2.0 license",
     zip_safe=False,

diff --git a/warn/cache.py b/warn/cache.py
@@ -69,7 +69,7 @@ def read_csv(self, name):
         """
         path = Path(self.path, name)
         logger.debug(f"Reading CSV from cache {path}")
-        with open(path) as fh:
+        with open(path, encoding="utf-8") as fh:
             return list(csv.reader(fh))
 
     def download(

diff --git a/warn/scrapers/va.py b/warn/scrapers/va.py
@@ -1,13 +1,20 @@
+import datetime
 import logging
+import os
+from glob import glob
 from pathlib import Path
+from shutil import copyfile
+from time import sleep
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
 
 from .. import utils
 from ..cache import Cache
 
-# from bs4 import BeautifulSoup, Tag
-
-
-__authors__ = ["zstumgoren", "Dilcia19", "shallotly"]
+__authors__ = ["zstumgoren", "Dilcia19", "shallotly", "stucka"]
 __tags__ = ["html", "csv"]
 __source__ = {
     "name": "Virginia Employment Commission",
@@ -30,37 +37,127 @@ def scrape(
 
     Returns: the Path where the file is written
     """
-    # This scraper initially tried to get a CSV download link that was only for the most recent entries. The scraping part of that broke.
-    # It's now hard-coded to a particular download link with parameters that should get the full thing.
+    cache = Cache(cache_dir)
+    csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
+
+    """
+    This scraper originally tried to parse HTML to find a CSV download link.
+    The HTML scraping portion broke in early December 2024. The code had
+    also been downloading an incomplete slice of the data.
+
+    In late December 2024, everything broke because Virginia decided to begin
+    testing for Javascript-aware browsers. This code is the way it is because
+    every alternative considered was somehow worse. Not helping? Losing about
+    four hours of work including the extensive documentation on the
+    alternatives sought.
+
+    Virginia's protections require a JS-aware browser to evaluate some
+    obscured, frequently changing code to set some short-lived cookies.
+    Without those cookies, no code. And even headless browsers get blocked
+    by a video test. Really unfun. So a ... headed? ... JS-aware browser
+    is required.
+
+    Some things evaluated included, off memory:
+
+    -- Using Playwright instead. This looked like a reasonable approach but
+        was awful resource-wise. Playwright itself had significant overhead,
+        partially from requiring its own version of browsers to be installed.
+        There's apparently some way with YAML to try to get Github Actions,
+        where this project is in production, to install only for particular
+        branches. Without that code, this'd be pending a couple minutes
+        several times a day on each of about 40 different branches of code.
+    -- Using Selenium. This is where it ultimately landed. It's not great,
+        but after trying about a dozen alternatives it's the best we got.
+    -- Installation code for Chrome's driver started acting flaky between
+        platforms.
+    -- PhantomJS couldn't even get past the first brush with the protection.
+    -- The optimal file is the CSV created by the state with well-defined
+        fields. Unfortunately, hitting the link once approved by the
+        Javascript results in an immediate download. There's no regular way
+        to get the file path through Javascript. Backdoor efforts like trying
+        to go through the Download menu also failed, because Chrome puts
+        them into a Shadow DOM. Several hunks of code to try to access the
+        Shadow DOM and get at the local filename are no longer functional
+        in Chrome. Building an extension to track some of this ... is not
+        an option, and loading it the first time would require human
+        intervention rather than automation. There might be a way to mess
+        with the Shadow DOM through CSS manipulation, but that looked to
+        weird to bother trying especially given other more reasonable measures
+        that no longer worked.
+    -- Also, efforts to get at the CSV through view-source failed.
+    -- And it's possible to scrape the HTML and try to parse it back out for
+        what warn-scraper needs, but that seemed even more fraught than trying
+        to get the CSV.
+    -- So if the filename isn't obtainable through Chrome, where do we get it?
+        There's a multiplatform way to get at a user's home directory. For
+        many people Downloads is off there, at ... ~/Downloads, capital D,
+        plural. Except people can configure that differently. And most
+        languages won't call it Downloads. And Chrome of course lets people
+        set a default download location that can be anywhere else, or select
+        a per-file location ("Ask me where to save this" or some such).
+        After going down even more rabbit holes, ... ~/Downloads is all that
+        gets implemented here.
+    -- I tried to see if Firefox might be a little less grumpy. One Python
+        driver-finder got one day of commits. A fork has Issues turned off
+        somehow. The third one I looked at was the one that was grumpy for
+        Chrome, and its maintainer is apparently trying to protect his
+        homeland with FPV drones. So ... back to Chrome.
+
+    So, yes, this is a weird implementation. It's a terrible model. It's
+    even got a hard-coded wait. At least as of late December 2024, however,
+    it does work. ... in late December 2024.
+    """
 
-    # This may break again, but this revised attempt has far fewer moving parts and actually fetches the complete data set.
-    # Blame Stucka in December 2024.
+    # driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
+    logger.debug("Attempting to launch Chrome")
+    chromeoptionsholder = ChromeOptions()
+    chrome_install = ChromeDriverManager().install()
 
-    # Get the WARN page
-    # url = "https://www.vec.virginia.gov/warn-notices"
-    # url = "https://vec.virginia.gov/warn-notices?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
-    # r = utils.get_url(url, verify=True)
-    # html = r.text
+    # Weird error with finding the driver name in Windows. Sometimes.
+    if chrome_install.endswith("THIRD_PARTY_NOTICES.chromedriver"):
+        chrome_install = chrome_install.replace(
+            "THIRD_PARTY_NOTICES.chromedriver", "chromedriver.exe"
+        )
+    logger.debug(f"Chrome install variable is {chrome_install}")
+    # folder = os.path.dirname(chrome_install)
+    # chromedriver_path = folder #  os.path.join(folder, "chromedriver.exe")
+    # service = ChromeService(chromedriver_path)
+    service = ChromeService(chrome_install)
+    driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
+    logger.debug(f"Attempting to fetch {csv_url}")
+    driver.get(csv_url)
 
-    # Save it to the cache
-    cache = Cache(cache_dir)
-    # cache.write("va/source.html", html)
+    sleep(30)  # Give it plenty of time to evaluate Javascript
+
+    download_dir = os.path.expanduser("~") + "/Downloads"
+
+    if not os.path.isdir(download_dir):
+        logger.error(f"The download directory is not {download_dir}.")
+
+    # get the list of files
+    list_of_files = glob(download_dir + "/warn-notices-csv*.csv")
+    if len(list_of_files) == 0:
+        logger.error(f"No matching files found in {download_dir}.")
+
+    # get the latest file name
+    latest_file = max(list_of_files, key=os.path.getctime)
+    latest_file_time = datetime.datetime.fromtimestamp(os.path.getctime(latest_file))
+
+    # print the latest file name
+    logger.debug(f"CSV saved to {latest_file}, saved at {latest_file_time}")
+
+    target_filename = cache_dir / "va" / "source.csv"
+
+    utils.create_directory(path=cache_dir / "va", is_file=False)
 
-    # Parse out the CSV download link
-    # soup = BeautifulSoup(html, "html.parser")
-    # csv_link = soup.find("a", text="Download")
-    # if isinstance(csv_link, Tag):
-    #     csv_href = csv_link["href"]
-    # else:
-    #     raise ValueError("Could not find CSV link")
+    logger.debug(f"Saving file to {target_filename}")
 
-    # csv_href = "/warn-notices-csv.csv?"
-    # csv_url = f"https://www.vec.virginia.gov{csv_href}"
+    copyfile(latest_file, target_filename)
 
-    csv_url = "https://vec.virginia.gov/warn-notices-csv.csv?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
+    driver.quit()
 
     # Download it to the cache
-    cache.download("va/source.csv", csv_url, verify=True)
+    # cache.download("va/source.csv", csv_url, verify=True)
 
     # Open it up as a list of rows
     csv_rows = cache.read_csv("va/source.csv")

diff --git a/warn/utils.py b/warn/utils.py
@@ -86,7 +86,7 @@ def save_if_good_url(filename, url, **kwargs):
         success_flag = False
         content = False
     else:
-        with open(filename, "wb") as outfile:
+        with open(filename, "wb", encoding="utf-8") as outfile:
             outfile.write(response.content)
             success_flag = True
             content = response.content
@@ -104,7 +104,7 @@ def write_rows_to_csv(output_path: Path, rows: list, mode="w"):
     """
     create_directory(output_path, is_file=True)
     logger.debug(f"Writing {len(rows)} rows to {output_path}")
-    with open(output_path, mode, newline="") as f:
+    with open(output_path, mode, newline="", encoding="utf-8") as f:
         writer = csv.writer(f)
         writer.writerows(rows)