From e451698822a5df1e2639d40fce2b3a180d0a7493 Mon Sep 17 00:00:00 2001 From: modhurita Date: Mon, 29 Jan 2024 15:03:17 +0100 Subject: [PATCH 1/5] fix pagination issue --- artscraper/find_artworks.py | 52 +++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index 63e8ffa..a83fc25 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -209,26 +209,46 @@ def get_artist_works(self): # Find the parent element corresponding to the text heading parent_element = element.find_element('xpath', '../..') - # Find right arrow button - right_arrow_element = parent_element.find_element('xpath', \ - './/*[contains(@data-gaaction,"rightArrow")]') - - # Check if right arrow button can still be clicked - while right_arrow_element.get_attribute('tabindex') is not None: + # Find number of artists + # Find elements with tag name 'h3' + items_elements = parent_element.find_elements('tag name', 'h3') + for element in items_elements: + if 'items' in element.text: + match = re.search(r'\d+', element.text) + if match: + total_num_artworks = int(match.group()) + break + + # Initialize number of artworks + num_artworks = 0 + + while num_artworks < total_num_artworks: # Find right arrow button right_arrow_element = parent_element.find_element('xpath', \ './/*[contains(@data-gaaction,"rightArrow")]') - # Click on right arrow button - self.driver.execute_script("arguments[0].click();", right_arrow_element) - # Wait for page to load - time.sleep(random_wait_time(min_wait=self.min_wait_time)) - - # List of all elements with links to artworks - elements = right_arrow_element.find_elements('xpath', \ - '//*[contains(@href,"/asset/")]') - # Get the links from the XPath elements - list_links = [element.get_attribute('href') for element in elements] + # Check if right arrow button can still be clicked + if right_arrow_element.get_attribute('tabindex') is not None: + # Find right arrow button + right_arrow_element = parent_element.find_element('xpath', \ + './/*[contains(@data-gaaction,"rightArrow")]') + # Click on right arrow button + self.driver.execute_script("arguments[0].click();", right_arrow_element) + + # List of all elements with links to artworks + elements = right_arrow_element.find_elements('xpath', \ + '//*[contains(@href,"/asset/")]') + + # Get the links from the XPath elements + list_links = [element.get_attribute('href') for element in elements] + + num_artworks = len(list_links) + + # Check if total number of artworks is reached + if num_artworks < total_num_artworks: + # Wait for page to load + time.sleep(random_wait_time(min_wait=self.min_wait_time)) + continue return list_links From 8c0af0935cbcf53d1811852cec58581824c0b583 Mon Sep 17 00:00:00 2001 From: modhurita Date: Mon, 29 Jan 2024 15:08:06 +0100 Subject: [PATCH 2/5] remove trailing whitespace --- artscraper/find_artworks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index a83fc25..b598f07 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -213,12 +213,12 @@ def get_artist_works(self): # Find elements with tag name 'h3' items_elements = parent_element.find_elements('tag name', 'h3') for element in items_elements: - if 'items' in element.text: - match = re.search(r'\d+', element.text) + if 'items' in element.text: + match = re.search(r'\d+', element.text) if match: total_num_artworks = int(match.group()) break - + # Initialize number of artworks num_artworks = 0 @@ -234,7 +234,7 @@ def get_artist_works(self): './/*[contains(@data-gaaction,"rightArrow")]') # Click on right arrow button self.driver.execute_script("arguments[0].click();", right_arrow_element) - + # List of all elements with links to artworks elements = right_arrow_element.find_elements('xpath', \ '//*[contains(@href,"/asset/")]') @@ -243,7 +243,7 @@ def get_artist_works(self): list_links = [element.get_attribute('href') for element in elements] num_artworks = len(list_links) - + # Check if total number of artworks is reached if num_artworks < total_num_artworks: # Wait for page to load From 5374fc4eb4b6f2e14e705af2b96300a327ceaeaa Mon Sep 17 00:00:00 2001 From: modhurita Date: Tue, 30 Jan 2024 10:45:33 +0100 Subject: [PATCH 3/5] avoid possibility of infinite loop --- artscraper/find_artworks.py | 41 ++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index b598f07..4c47a38 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -221,12 +221,28 @@ def get_artist_works(self): # Initialize number of artworks num_artworks = 0 + # Initialize count of number of iterations for which the number of artworks remains the same + count = 0 + + while True: + + # Save current number of artworks + old_num_artworks = num_artworks - while num_artworks < total_num_artworks: # Find right arrow button right_arrow_element = parent_element.find_element('xpath', \ './/*[contains(@data-gaaction,"rightArrow")]') + # List of all elements with links to artworks + elements = right_arrow_element.find_elements('xpath', \ + '//*[contains(@href,"/asset/")]') + + # Get the links from the XPath elements + list_links = [element.get_attribute('href') for element in elements] + + # Calculate new number of artworks + num_artworks = len(list_links) + # Check if right arrow button can still be clicked if right_arrow_element.get_attribute('tabindex') is not None: # Find right arrow button @@ -242,13 +258,28 @@ def get_artist_works(self): # Get the links from the XPath elements list_links = [element.get_attribute('href') for element in elements] + # Calculate new number of artworks num_artworks = len(list_links) # Check if total number of artworks is reached - if num_artworks < total_num_artworks: - # Wait for page to load - time.sleep(random_wait_time(min_wait=self.min_wait_time)) - continue + if total_num_artworks: + if num_artworks < total_num_artworks: + # Wait for page to load + time.sleep(random_wait_time(min_wait=self.min_wait_time)) + continue + # Break out of the while loop if total_num_artworks is reached + break + + if num_artworks > old_num_artworks: + # Count number of iterations for which the number of artworks remains the same + count = 0 + else: + count = count+1 + + # Try thrice before deciding that there are no more artworks to be scraped + if count > 3: + break + return list_links From 141f29480bfd03b33850e44ebc98729d47b606a1 Mon Sep 17 00:00:00 2001 From: modhurita Date: Thu, 1 Feb 2024 12:17:33 +0100 Subject: [PATCH 4/5] refine handling of edge cases --- artscraper/find_artworks.py | 89 ++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 41 deletions(-) diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index 4c47a38..4f05f11 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -209,6 +209,10 @@ def get_artist_works(self): # Find the parent element corresponding to the text heading parent_element = element.find_element('xpath', '../..') + # Initialize total number of artworks + # (set to number of artworks by artist with the most artworks) + total_num_artworks = 200000 + # Find number of artists # Find elements with tag name 'h3' items_elements = parent_element.find_elements('tag name', 'h3') @@ -219,15 +223,16 @@ def get_artist_works(self): total_num_artworks = int(match.group()) break - # Initialize number of artworks - num_artworks = 0 - # Initialize count of number of iterations for which the number of artworks remains the same - count = 0 + # Find right arrow element + def _find_right_arrow_element(parent_element): + + right_arrow_element = parent_element.find_element('xpath', \ + './/*[contains(@data-gaaction,"rightArrow")]') - while True: + return right_arrow_element - # Save current number of artworks - old_num_artworks = num_artworks + # Get list of artwork links + def _get_list_links(parent_element): # Find right arrow button right_arrow_element = parent_element.find_element('xpath', \ @@ -240,46 +245,48 @@ def get_artist_works(self): # Get the links from the XPath elements list_links = [element.get_attribute('href') for element in elements] - # Calculate new number of artworks - num_artworks = len(list_links) + return list_links + + # Click on right arrow + def _click_on_right_arrow(parent_element): + + # Find right arrow button + right_arrow_element = parent_element.find_element('xpath', \ + './/*[contains(@data-gaaction,"rightArrow")]') + # Click on right arrow button + self.driver.execute_script("arguments[0].click();", right_arrow_element) + + list_links = _get_list_links(parent_element) + + # Initialize count of number of iterations for which the number of artworks remains the same + n_tries = 0 + + while (len(list_links) < total_num_artworks and + not (total_num_artworks == 0 and n_tries > 3)): + + # Save current number of artworks + old_num_artworks = len(list_links) + + # Find right arrow element + right_arrow_element = _find_right_arrow_element(parent_element) # Check if right arrow button can still be clicked if right_arrow_element.get_attribute('tabindex') is not None: - # Find right arrow button - right_arrow_element = parent_element.find_element('xpath', \ - './/*[contains(@data-gaaction,"rightArrow")]') - # Click on right arrow button - self.driver.execute_script("arguments[0].click();", right_arrow_element) - - # List of all elements with links to artworks - elements = right_arrow_element.find_elements('xpath', \ - '//*[contains(@href,"/asset/")]') - - # Get the links from the XPath elements - list_links = [element.get_attribute('href') for element in elements] - - # Calculate new number of artworks - num_artworks = len(list_links) - - # Check if total number of artworks is reached - if total_num_artworks: - if num_artworks < total_num_artworks: - # Wait for page to load - time.sleep(random_wait_time(min_wait=self.min_wait_time)) - continue - # Break out of the while loop if total_num_artworks is reached - break - if num_artworks > old_num_artworks: - # Count number of iterations for which the number of artworks remains the same - count = 0 - else: - count = count+1 + # Click on right arrow + _click_on_right_arrow(parent_element) - # Try thrice before deciding that there are no more artworks to be scraped - if count > 3: - break + # Wait for page to load + time.sleep(random_wait_time(min_wait=self.min_wait_time)) + # Obtain new list of artworks + list_links = _get_list_links(parent_element) + + if len(list_links) == old_num_artworks: + # Count number of iterations for which the number of artworks remains the same + n_tries = n_tries + 1 + else: + n_tries = 0 return list_links From 29060b955af6b164dd492432d419b6de0752f3f2 Mon Sep 17 00:00:00 2001 From: modhurita Date: Thu, 1 Feb 2024 14:18:45 +0100 Subject: [PATCH 5/5] remove check for total number of artworks being 0 --- artscraper/find_artworks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index 4f05f11..839da03 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -261,8 +261,7 @@ def _click_on_right_arrow(parent_element): # Initialize count of number of iterations for which the number of artworks remains the same n_tries = 0 - while (len(list_links) < total_num_artworks and - not (total_num_artworks == 0 and n_tries > 3)): + while (len(list_links) < total_num_artworks and n_tries < 3): # Save current number of artworks old_num_artworks = len(list_links)