From 08d4ea469ef12c55ce27372799ee447000311519 Mon Sep 17 00:00:00 2001 From: Marc Wrobel Date: Tue, 2 Apr 2024 22:28:18 +0200 Subject: [PATCH] Do not use wait_until='networkidle' by default with fetch_javascript_url (#340) It does not work in all situations, especially with GitHub-rendered markdown files. --- releases/apache-airflow.json | 2 +- src/artifactory.py | 2 +- src/common/http.py | 9 +++++---- src/oracle-jdk.py | 2 +- src/release_table.py | 8 ++++++-- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/releases/apache-airflow.json b/releases/apache-airflow.json index fe163269..87c0e0c9 100644 --- a/releases/apache-airflow.json +++ b/releases/apache-airflow.json @@ -262,4 +262,4 @@ "date": "2017-05-09" } } -} +} \ No newline at end of file diff --git a/src/artifactory.py b/src/artifactory.py index 77ca3766..390f5253 100644 --- a/src/artifactory.py +++ b/src/artifactory.py @@ -5,7 +5,7 @@ needed to render the page.""" with releasedata.ProductData("artifactory") as product_data: - content = http.fetch_javascript_url('https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life') + content = http.fetch_javascript_url('https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life', wait_until = 'networkidle') soup = BeautifulSoup(content, 'html.parser') for row in soup.select('.informaltable tbody tr'): diff --git a/src/common/http.py b/src/common/http.py index 4dd277aa..7f57d12d 100644 --- a/src/common/http.py +++ b/src/common/http.py @@ -38,7 +38,8 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None raise e # So that the function does not get stuck in an infinite loop. # We could wait a bit before retrying, but it's not clear if it would help. 
- logging.warning(f"Got ChunkedEncodingError while fetching {urls} ({e}), retrying (remaining retries = {next_max_retries}).") + logging.warning( + f"Got ChunkedEncodingError while fetching {urls} ({e}), retrying (remaining retries = {next_max_retries}).") return fetch_urls(urls, data, headers, next_max_retries, backoff_factor, timeout) @@ -48,13 +49,13 @@ def fetch_url(url: str, data: any = None, headers: dict[str, str] = None, # This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright. -def fetch_javascript_url(url: str, click_selector: str = None) -> str: - logging.info(f"Fetching {url}") +def fetch_javascript_url(url: str, click_selector: str = None, wait_until: str = None) -> str: + logging.info(f"Fetching {url} with JavaScript (click_selector = {click_selector}, wait_until = {wait_until})") with sync_playwright() as p: browser = p.chromium.launch() try: page = browser.new_page() - page.goto(url, wait_until='networkidle') + page.goto(url, wait_until=wait_until) if click_selector: logging.info(f"Clicked on {click_selector}") page.click(click_selector) diff --git a/src/oracle-jdk.py b/src/oracle-jdk.py index 5ff36ce2..21feabb8 100644 --- a/src/oracle-jdk.py +++ b/src/oracle-jdk.py @@ -6,7 +6,7 @@ This script is using requests-html because the page needs JavaScript to render correctly.""" with releasedata.ProductData("oracle-jdk") as product_data: - content = http.fetch_javascript_url('https://www.java.com/releases/') + content = http.fetch_javascript_url('https://www.java.com/releases/', wait_until='networkidle') soup = BeautifulSoup(content, 'html.parser') previous_date = None diff --git a/src/release_table.py b/src/release_table.py index eace8d3f..391bc807 100644 --- a/src/release_table.py +++ b/src/release_table.py @@ -19,7 +19,9 @@ - rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows. 
- render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page. - render_javascript_click_selector (optional, default = None): A playwright selector used to click on an element after - the JavaScript rendering. + the JavaScript rendering. Only use when render_javascript is true. +- render_javascript_wait_until (optional, default = None): Argument to pass to Playwright, one of "commit", + "domcontentloaded", "load", or "networkidle". Only use when render_javascript is true and if the script fails without it. - ignore_empty_releases (optional, default = false): A boolean value indicating whether to ignore releases with no fields except the name. - fields: A dictionary that maps release fields to the table's columns. Field definition include: @@ -146,6 +148,7 @@ def __repr__(self) -> str: with releasedata.ProductData(config.product) as product_data: render_javascript = config.data.get("render_javascript", False) render_javascript_click_selector = config.data.get("render_javascript_click_selector", None) + render_javascript_wait_until = config.data.get("render_javascript_wait_until", None) ignore_empty_releases = config.data.get("ignore_empty_releases", False) header_row_selector = config.data.get("header_selector", "thead tr") rows_selector = config.data.get("rows_selector", "tbody tr") @@ -154,7 +157,8 @@ def __repr__(self) -> str: fields = [Field(name, definition) for name, definition in config.data["fields"].items()] if render_javascript: - response_text = http.fetch_javascript_url(config.url, click_selector=render_javascript_click_selector) + response_text = http.fetch_javascript_url(config.url, click_selector=render_javascript_click_selector, + wait_until=render_javascript_wait_until) else: response_text = http.fetch_url(config.url).text soup = BeautifulSoup(response_text, features="html5lib")