Skip to content

Commit

Permalink
Do not use wait_until='networkidle' by default with fetch_javascript_url (#340)
Browse files Browse the repository at this point in the history

It does not work in all situations, especially with GitHub-rendered markdown files.
  • Loading branch information
marcwrobel committed Apr 2, 2024
1 parent 81b6558 commit 08d4ea4
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 9 deletions.
2 changes: 1 addition & 1 deletion releases/apache-airflow.json
Original file line number Diff line number Diff line change
Expand Up @@ -262,4 +262,4 @@
"date": "2017-05-09"
}
}
}
}
2 changes: 1 addition & 1 deletion src/artifactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
needed to render the page."""

with releasedata.ProductData("artifactory") as product_data:
content = http.fetch_javascript_url('https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life')
content = http.fetch_javascript_url('https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life', wait_until = 'networkidle')
soup = BeautifulSoup(content, 'html.parser')

for row in soup.select('.informaltable tbody tr'):
Expand Down
9 changes: 5 additions & 4 deletions src/common/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None
raise e # So that the function does not get stuck in an infinite loop.

# We could wait a bit before retrying, but it's not clear if it would help.
logging.warning(f"Got ChunkedEncodingError while fetching {urls} ({e}), retrying (remaining retries = {next_max_retries}).")
logging.warning(
f"Got ChunkedEncodingError while fetching {urls} ({e}), retrying (remaining retries = {next_max_retries}).")
return fetch_urls(urls, data, headers, next_max_retries, backoff_factor, timeout)


Expand All @@ -48,13 +49,13 @@ def fetch_url(url: str, data: any = None, headers: dict[str, str] = None,


# This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright.
def fetch_javascript_url(url: str, click_selector: str = None) -> str:
logging.info(f"Fetching {url}")
def fetch_javascript_url(url: str, click_selector: str = None, wait_until: str = None) -> str:
logging.info(f"Fetching {url} with JavaScript (click_selector = {click_selector}, wait_until = {wait_until})")
with sync_playwright() as p:
browser = p.chromium.launch()
try:
page = browser.new_page()
page.goto(url, wait_until='networkidle')
page.goto(url, wait_until=wait_until)
if click_selector:
logging.info(f"Clicked on {click_selector}")
page.click(click_selector)
Expand Down
2 changes: 1 addition & 1 deletion src/oracle-jdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
This script is using requests-html because the page needs JavaScript to render correctly."""

with releasedata.ProductData("oracle-jdk") as product_data:
content = http.fetch_javascript_url('https://www.java.com/releases/')
content = http.fetch_javascript_url('https://www.java.com/releases/', wait_until='networkidle')
soup = BeautifulSoup(content, 'html.parser')

previous_date = None
Expand Down
8 changes: 6 additions & 2 deletions src/release_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
- rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows.
- render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page.
- render_javascript_click_selector (optional, default = None): A playwright selector used to click on an element after
the JavaScript rendering.
the JavaScript rendering. Only use when render_javascript is true.
- render_javascript_wait_until (optional, default = None): Argument to pass to Playwright, one of "commit",
"domcontentloaded", "load", or "networkidle". Only use when render_javascript is true and if the script fails without it.
- ignore_empty_releases (optional, default = false): A boolean value indicating whether to ignore releases with no
fields except the name.
- fields: A dictionary that maps release fields to the table's columns. Field definition include:
Expand Down Expand Up @@ -146,6 +148,7 @@ def __repr__(self) -> str:
with releasedata.ProductData(config.product) as product_data:
render_javascript = config.data.get("render_javascript", False)
render_javascript_click_selector = config.data.get("render_javascript_click_selector", None)
render_javascript_wait_until = config.data.get("render_javascript_wait_until", None)
ignore_empty_releases = config.data.get("ignore_empty_releases", False)
header_row_selector = config.data.get("header_selector", "thead tr")
rows_selector = config.data.get("rows_selector", "tbody tr")
Expand All @@ -154,7 +157,8 @@ def __repr__(self) -> str:
fields = [Field(name, definition) for name, definition in config.data["fields"].items()]

if render_javascript:
response_text = http.fetch_javascript_url(config.url, click_selector=render_javascript_click_selector)
response_text = http.fetch_javascript_url(config.url, click_selector=render_javascript_click_selector,
wait_until=render_javascript_wait_until)
else:
response_text = http.fetch_url(config.url).text
soup = BeautifulSoup(response_text, features="html5lib")
Expand Down

0 comments on commit 08d4ea4

Please sign in to comment.