From e451698822a5df1e2639d40fce2b3a180d0a7493 Mon Sep 17 00:00:00 2001
From: modhurita <modhurita@gmail.com>
Date: Mon, 29 Jan 2024 15:03:17 +0100
Subject: [PATCH 1/5] fix pagination: click right arrow until item count is reached

---
 artscraper/find_artworks.py | 52 +++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py
index 63e8ffa..a83fc25 100644
--- a/artscraper/find_artworks.py
+++ b/artscraper/find_artworks.py
@@ -209,26 +209,46 @@ def get_artist_works(self):
         # Find the parent element corresponding to the text heading
         parent_element = element.find_element('xpath', '../..')
 
-        # Find right arrow button
-        right_arrow_element = parent_element.find_element('xpath', \
-            './/*[contains(@data-gaaction,"rightArrow")]')
-
-        # Check if right arrow button can still be clicked
-        while right_arrow_element.get_attribute('tabindex') is not None:
+        # Find number of artists
+        # Find elements with tag name 'h3'
+        items_elements = parent_element.find_elements('tag name', 'h3')
+        for element in items_elements:
+            if 'items' in element.text: 
+                match = re.search(r'\d+', element.text)        
+                if match:
+                    total_num_artworks = int(match.group())
+                    break
+            
+        # Initialize number of artworks
+        num_artworks = 0
+
+        while num_artworks < total_num_artworks:
             # Find right arrow button
             right_arrow_element = parent_element.find_element('xpath', \
                 './/*[contains(@data-gaaction,"rightArrow")]')
-            # Click on right arrow button
-            self.driver.execute_script("arguments[0].click();", right_arrow_element)
-            # Wait for page to load
-            time.sleep(random_wait_time(min_wait=self.min_wait_time))
-
-        # List of all elements with links to artworks
-        elements = right_arrow_element.find_elements('xpath', \
-                '//*[contains(@href,"/asset/")]')
 
-        # Get the links from the XPath elements
-        list_links = [element.get_attribute('href') for element in elements]
+            # Check if right arrow button can still be clicked
+            if right_arrow_element.get_attribute('tabindex') is not None:
+                # Find right arrow button
+                right_arrow_element = parent_element.find_element('xpath', \
+                    './/*[contains(@data-gaaction,"rightArrow")]')
+                # Click on right arrow button
+                self.driver.execute_script("arguments[0].click();", right_arrow_element)
+                
+                # List of all elements with links to artworks
+                elements = right_arrow_element.find_elements('xpath', \
+                    '//*[contains(@href,"/asset/")]')
+
+                # Get the links from the XPath elements
+                list_links = [element.get_attribute('href') for element in elements]
+
+                num_artworks = len(list_links)
+                
+                # Check if total number of artworks is reached
+                if num_artworks < total_num_artworks:
+                    # Wait for page to load
+                    time.sleep(random_wait_time(min_wait=self.min_wait_time))
+                    continue
 
         return list_links
 

From 8c0af0935cbcf53d1811852cec58581824c0b583 Mon Sep 17 00:00:00 2001
From: modhurita <modhurita@gmail.com>
Date: Mon, 29 Jan 2024 15:08:06 +0100
Subject: [PATCH 2/5] remove trailing whitespace

---
 artscraper/find_artworks.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py
index a83fc25..b598f07 100644
--- a/artscraper/find_artworks.py
+++ b/artscraper/find_artworks.py
@@ -213,12 +213,12 @@ def get_artist_works(self):
         # Find elements with tag name 'h3'
         items_elements = parent_element.find_elements('tag name', 'h3')
         for element in items_elements:
-            if 'items' in element.text: 
-                match = re.search(r'\d+', element.text)        
+            if 'items' in element.text:
+                match = re.search(r'\d+', element.text)
                 if match:
                     total_num_artworks = int(match.group())
                     break
-            
+
         # Initialize number of artworks
         num_artworks = 0
 
@@ -234,7 +234,7 @@ def get_artist_works(self):
                     './/*[contains(@data-gaaction,"rightArrow")]')
                 # Click on right arrow button
                 self.driver.execute_script("arguments[0].click();", right_arrow_element)
-                
+
                 # List of all elements with links to artworks
                 elements = right_arrow_element.find_elements('xpath', \
                     '//*[contains(@href,"/asset/")]')
@@ -243,7 +243,7 @@ def get_artist_works(self):
                 list_links = [element.get_attribute('href') for element in elements]
 
                 num_artworks = len(list_links)
-                
+
                 # Check if total number of artworks is reached
                 if num_artworks < total_num_artworks:
                     # Wait for page to load

From 5374fc4eb4b6f2e14e705af2b96300a327ceaeaa Mon Sep 17 00:00:00 2001
From: modhurita <modhurita@gmail.com>
Date: Tue, 30 Jan 2024 10:45:33 +0100
Subject: [PATCH 3/5] avoid possibility of infinite loop

---
 artscraper/find_artworks.py | 41 ++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py
index b598f07..4c47a38 100644
--- a/artscraper/find_artworks.py
+++ b/artscraper/find_artworks.py
@@ -221,12 +221,28 @@ def get_artist_works(self):
 
         # Initialize number of artworks
         num_artworks = 0
+        # Initialize count of number of iterations for which the number of artworks remains the same
+        count = 0
+
+        while True:
+
+            # Save current number of artworks
+            old_num_artworks = num_artworks
 
-        while num_artworks < total_num_artworks:
             # Find right arrow button
             right_arrow_element = parent_element.find_element('xpath', \
                 './/*[contains(@data-gaaction,"rightArrow")]')
 
+            # List of all elements with links to artworks
+            elements = right_arrow_element.find_elements('xpath', \
+                '//*[contains(@href,"/asset/")]')
+
+            # Get the links from the XPath elements
+            list_links = [element.get_attribute('href') for element in elements]
+
+            # Calculate new number of artworks
+            num_artworks = len(list_links)
+
             # Check if right arrow button can still be clicked
             if right_arrow_element.get_attribute('tabindex') is not None:
                 # Find right arrow button
@@ -242,13 +258,28 @@ def get_artist_works(self):
                 # Get the links from the XPath elements
                 list_links = [element.get_attribute('href') for element in elements]
 
+                # Calculate new number of artworks
                 num_artworks = len(list_links)
 
                 # Check if total number of artworks is reached
-                if num_artworks < total_num_artworks:
-                    # Wait for page to load
-                    time.sleep(random_wait_time(min_wait=self.min_wait_time))
-                    continue
+                if total_num_artworks:
+                    if num_artworks < total_num_artworks:
+                        # Wait for page to load
+                        time.sleep(random_wait_time(min_wait=self.min_wait_time))
+                        continue
+                    # Break out of the while loop if total_num_artworks is reached
+                    break
+
+            if num_artworks > old_num_artworks:
+                # Count number of iterations for which the number of artworks remains the same
+                count = 0
+            else:
+                count = count+1
+
+            # Try thrice before deciding that there are no more artworks to be scraped
+            if count > 3:
+                break
+
 
         return list_links
 

From 141f29480bfd03b33850e44ebc98729d47b606a1 Mon Sep 17 00:00:00 2001
From: modhurita <modhurita@gmail.com>
Date: Thu, 1 Feb 2024 12:17:33 +0100
Subject: [PATCH 4/5] refine edge cases: default item count, extract paging helpers

---
 artscraper/find_artworks.py | 89 ++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 41 deletions(-)

diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py
index 4c47a38..4f05f11 100644
--- a/artscraper/find_artworks.py
+++ b/artscraper/find_artworks.py
@@ -209,6 +209,10 @@ def get_artist_works(self):
         # Find the parent element corresponding to the text heading
         parent_element = element.find_element('xpath', '../..')
 
+        # Initialize total number of artworks
+        # (set to number of artworks by artist with the most artworks)
+        total_num_artworks = 200000
+
         # Find number of artists
         # Find elements with tag name 'h3'
         items_elements = parent_element.find_elements('tag name', 'h3')
@@ -219,15 +223,16 @@ def get_artist_works(self):
                     total_num_artworks = int(match.group())
                     break
 
-        # Initialize number of artworks
-        num_artworks = 0
-        # Initialize count of number of iterations for which the number of artworks remains the same
-        count = 0
+        # Find right arrow element
+        def _find_right_arrow_element(parent_element):
+
+            right_arrow_element = parent_element.find_element('xpath', \
+                './/*[contains(@data-gaaction,"rightArrow")]')
 
-        while True:
+            return right_arrow_element
 
-            # Save current number of artworks
-            old_num_artworks = num_artworks
+        # Get list of artwork links
+        def _get_list_links(parent_element):
 
             # Find right arrow button
             right_arrow_element = parent_element.find_element('xpath', \
@@ -240,46 +245,48 @@ def get_artist_works(self):
             # Get the links from the XPath elements
             list_links = [element.get_attribute('href') for element in elements]
 
-            # Calculate new number of artworks
-            num_artworks = len(list_links)
+            return list_links
+
+        # Click on right arrow
+        def _click_on_right_arrow(parent_element):
+
+            # Find right arrow button
+            right_arrow_element = parent_element.find_element('xpath', \
+                './/*[contains(@data-gaaction,"rightArrow")]')
+            # Click on right arrow button
+            self.driver.execute_script("arguments[0].click();", right_arrow_element)
+
+        list_links = _get_list_links(parent_element)
+
+        # Initialize count of number of iterations for which the number of artworks remains the same
+        n_tries = 0
+
+        while (len(list_links) < total_num_artworks and
+               not (total_num_artworks == 0 and n_tries > 3)):
+
+            # Save current number of artworks
+            old_num_artworks = len(list_links)
+
+            # Find right arrow element
+            right_arrow_element =  _find_right_arrow_element(parent_element)
 
             # Check if right arrow button can still be clicked
             if right_arrow_element.get_attribute('tabindex') is not None:
-                # Find right arrow button
-                right_arrow_element = parent_element.find_element('xpath', \
-                    './/*[contains(@data-gaaction,"rightArrow")]')
-                # Click on right arrow button
-                self.driver.execute_script("arguments[0].click();", right_arrow_element)
-
-                # List of all elements with links to artworks
-                elements = right_arrow_element.find_elements('xpath', \
-                    '//*[contains(@href,"/asset/")]')
-
-                # Get the links from the XPath elements
-                list_links = [element.get_attribute('href') for element in elements]
-
-                # Calculate new number of artworks
-                num_artworks = len(list_links)
-
-                # Check if total number of artworks is reached
-                if total_num_artworks:
-                    if num_artworks < total_num_artworks:
-                        # Wait for page to load
-                        time.sleep(random_wait_time(min_wait=self.min_wait_time))
-                        continue
-                    # Break out of the while loop if total_num_artworks is reached
-                    break
 
-            if num_artworks > old_num_artworks:
-                # Count number of iterations for which the number of artworks remains the same
-                count = 0
-            else:
-                count = count+1
+                # Click on right arrow
+                _click_on_right_arrow(parent_element)
 
-            # Try thrice before deciding that there are no more artworks to be scraped
-            if count > 3:
-                break
+                # Wait for page to load
+                time.sleep(random_wait_time(min_wait=self.min_wait_time))
 
+                # Obtain new list of artworks
+                list_links = _get_list_links(parent_element)
+
+            if len(list_links) == old_num_artworks:
+                # Count number of iterations for which the number of artworks remains the same
+                n_tries = n_tries + 1
+            else:
+                n_tries = 0
 
         return list_links
 

From 29060b955af6b164dd492432d419b6de0752f3f2 Mon Sep 17 00:00:00 2001
From: modhurita <modhurita@gmail.com>
Date: Thu, 1 Feb 2024 14:18:45 +0100
Subject: [PATCH 5/5] stop paging after 3 unchanged tries for any item count

---
 artscraper/find_artworks.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py
index 4f05f11..839da03 100644
--- a/artscraper/find_artworks.py
+++ b/artscraper/find_artworks.py
@@ -261,8 +261,7 @@ def _click_on_right_arrow(parent_element):
         # Initialize count of number of iterations for which the number of artworks remains the same
         n_tries = 0
 
-        while (len(list_links) < total_num_artworks and
-               not (total_num_artworks == 0 and n_tries > 3)):
+        while (len(list_links) < total_num_artworks and n_tries < 3):
 
             # Save current number of artworks
             old_num_artworks = len(list_links)