Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SPARQL query timeout problems #22

Merged
merged 12 commits into from
Jul 10, 2023
2 changes: 1 addition & 1 deletion artscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Scrape art image and metadata from WikiArt and Google Arts."""

from artscraper.functions import random_wait_time
from artscraper.functions import random_wait_time, retry
from artscraper.googleart import GoogleArtScraper
from artscraper.wikiart import WikiArtScraper
from artscraper.find_artworks import FindArtworks
Expand Down
5 changes: 3 additions & 2 deletions artscraper/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,9 @@ def save_metadata(self, meta_fp=None):
return
metadata = self.get_metadata()
self.paint_dir.mkdir(exist_ok=True)
with open(meta_fp, "w", encoding="utf-8") as f: # pylint: disable=invalid-name
json.dump(metadata, f)
with open(meta_fp, "w", encoding='utf-8') as f:
# pylint: disable=invalid-name
json.dump(metadata, f, ensure_ascii=False)

@abstractmethod
def save_image(self, img_fp=None, link=None):
Expand Down
7 changes: 5 additions & 2 deletions artscraper/find_artists.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
'''

import time

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager

from artscraper.functions import random_wait_time

def get_artist_links(webpage='https://artsandculture.google.com/category/artist',
executable_path='geckodriver',
min_wait_time=5, output_file=None):
'''
Parameters
Expand All @@ -22,7 +25,7 @@ def get_artist_links(webpage='https://artsandculture.google.com/category/artist'
'''

# Launch Firefox browser
driver = webdriver.Firefox(executable_path=executable_path)
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))

# Get Google Arts & Culture webpage listing all artists
driver.get(webpage)
Expand Down
53 changes: 32 additions & 21 deletions artscraper/find_artworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@

import time
import re
from urllib.parse import unquote
import json
import requests

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager

import wikipediaapi

Expand All @@ -28,13 +32,11 @@ class FindArtworks:
given the link to their Google Arts & Culture webpage
'''

def __init__(self, artist_link, executable_path='geckodriver',
def __init__(self, artist_link,
output_dir='./data', sparql_query= None, min_wait_time=5):

# Link to artist's Google Arts & Culture webpage
self.artist_link = artist_link
# Path to geckodriver
self.executable_path = executable_path
# Directory to which the data is to be written
# Create it if it doesn't exist
Path(output_dir).mkdir(parents=True, exist_ok=True)
Expand All @@ -50,6 +52,7 @@ def __init__(self, artist_link, executable_path='geckodriver',
SELECT
?familyName ?familyNameLabel
?givenName ?givenNameLabel
?pseudonym ?pseudonymLabel
?sexOrGender ?sexOrGenderLabel
?dateOfBirth ?dateOfBirthLabel
?placeOfBirth ?placeOfBirthLabel
Expand All @@ -64,43 +67,46 @@ def __init__(self, artist_link, executable_path='geckodriver',
?workLocation ?workLocationLabel
?genre ?genreLabel
?movement ?movementLabel
?occupation ?occupationLabel
WHERE {
OPTIONAL { wd:person_id wdt:P734 ?familyName. }
OPTIONAL { wd:person_id wdt:P735 ?givenName. }
OPTIONAL { wd:person_id wdt:P742 ?pseudonym. }
OPTIONAL { wd:person_id wdt:P21 ?sexOrGender. }
OPTIONAL {
wd:person_id wdt:P569 ?dateTimeOfBirth.
BIND (xsd:date(?dateTimeOfBirth) AS ?dateOfBirth)
}
OPTIONAL { wd:person_id wdt:P19 ?placeOfBirth. }
OPTIONAL {
?placeOfBirth wdt:P625 ?coordinatesBirth.
BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth)
BIND(geof:longitude(?coordinatesBirth) AS ?longitudeOfPlaceOfBirth)
OPTIONAL {
wd:person_id wdt:P19 ?placeOfBirth.
?placeOfBirth wdt:P625 ?coordinatesBirth.
BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth)
BIND(geof:longitude(?coordinatesBirth) AS ?longitudeOfPlaceOfBirth)
}
OPTIONAL {
wd:person_id wdt:P569 ?dateTimeOfDeath.
wd:person_id wdt:P570 ?dateTimeOfDeath.
BIND (xsd:date(?dateTimeOfDeath) AS ?dateOfDeath)
}
OPTIONAL { wd:person_id wdt:P20 ?placeOfDeath. }
OPTIONAL {
?placeOfDeath wdt:P625 ?coordinatesDeath.
BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath)
BIND(geof:longitude(?coordinatesDeath) AS ?longitudeOfPlaceOfDeath)
OPTIONAL {
wd:person_id wdt:P20 ?placeOfDeath.
?placeOfDeath wdt:P625 ?coordinatesDeath.
BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath)
BIND(geof:longitude(?coordinatesDeath) AS ?longitudeOfPlaceOfDeath)
}
OPTIONAL { wd:person_id wdt:P27 ?countryOfCitizenship. }
OPTIONAL { wd:person_id wdt:P551 ?residence. }
OPTIONAL { wd:person_id wdt:P937 ?workLocation. }
OPTIONAL { wd:person_id wdt:P136 ?genre. }
OPTIONAL { wd:person_id wdt:P135 ?movement. }
OPTIONAL { wd:person_id wdt:P106 ?occupation. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
'''
else:
self.sparql_query = sparql_query

# Open web browser
self.driver = webdriver.Firefox(executable_path=self.executable_path)
self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))


def __enter__(self):
Expand Down Expand Up @@ -145,7 +151,7 @@ def save_artist_information(self):
# Filenames for artist's works, description, metadata
artist_works_file = pathname_directory + '/' + 'works.txt'
artist_description_file = pathname_directory + '/' + 'description.txt'
artist_metadata_file = pathname_directory + '/' + 'metadata.txt'
artist_metadata_file = pathname_directory + '/' + 'metadata.json'

# Save artist's works, description, metadata
with open(artist_works_file, 'w', encoding='utf-8') as file:
Expand All @@ -154,8 +160,7 @@ def save_artist_information(self):
with open(artist_description_file, 'w', encoding='utf-8') as file:
file.write(artist_description)
with open(artist_metadata_file, 'w', encoding='utf-8') as file:
for key,value in artist_metadata.items():
file.write(f'{key} : {value}\n')
json.dump(artist_metadata, file, ensure_ascii=False)


def get_artist_works(self):
Expand All @@ -182,13 +187,13 @@ def get_artist_works(self):

# Check if right arrow button can still be clicked
while right_arrow_element.get_attribute('tabindex') is not None:
# Wait for page to load
time.sleep(random_wait_time(min_wait=self.min_wait_time))
# Find right arrow button
right_arrow_element = parent_element.find_element('xpath', \
'.//*[contains(@data-gaaction,"rightArrow")]')
# Click on right arrow button
self.driver.execute_script("arguments[0].click();", right_arrow_element)
# Wait for page to load
time.sleep(random_wait_time(min_wait=self.min_wait_time))

# List of all elements with links to artworks
elements = right_arrow_element.find_elements('xpath', \
Expand Down Expand Up @@ -218,6 +223,8 @@ def get_artist_description(self):
# Get summary of the page (lead section of the Wikipedia article)
description = page.summary

description = unquote(description)

return description

def get_artist_metadata(self):
Expand All @@ -239,7 +246,8 @@ def get_artist_metadata(self):
query = self.sparql_query.replace('person_id', artist_id)

# Send query request
request = requests.get(url, params= {'format': 'json', 'query': ''.join(query)}, timeout=30)
request = requests.get(url, params={'format': 'json', \
'query': ''.join(query)}, timeout=120)

# Convert response to dictionary
data = request.json()
Expand Down Expand Up @@ -292,6 +300,8 @@ def get_wikipedia_article_title(self):
# Get title of artist's Wikipedia article
title = wikipedia_link.rsplit('/')[-1]

title = unquote(title)

return title

def get_artist_wikidata_id(self):
Expand Down Expand Up @@ -340,6 +350,7 @@ def _get_property(self, data, query_property):
if query_property+'Label' in data['results']['bindings'][0].keys():
for element in data['results']['bindings']:
output_property = element[query_property+'Label']['value']
output_property = unquote(output_property)
# Avoid duplicates
if output_property not in output_property_list:
output_property_list.append(output_property)
Expand Down
44 changes: 40 additions & 4 deletions artscraper/functions.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
'''
random_wait_time: Function to determine a random wait time
between two events
'''
"""

Functions used repeatedly, and in many places:

random_wait_time
retry

"""

import time
from random import random

def random_wait_time(min_wait=5, max_wait=None):
Expand All @@ -26,6 +31,7 @@ def random_wait_time(min_wait=5, max_wait=None):
Waiting time between `min_wait` and `max_wait` according to
the polynomial PDF.
"""

# pylint: disable=invalid-name
if max_wait is None:
max_wait = 3 * min_wait
Expand All @@ -42,3 +48,33 @@ def inv_cdf(x):
return (b**-beta - beta * x / a)**(-1 / beta)

return inv_cdf(random())


def retry(function, max_retries, min_wait_time, *args):
    '''
    Call `function`, retrying with a randomized delay after each failure.

    Parameters
    ----------
    function: Callable to run (and re-run on failure)
    max_retries: Maximum number of attempts before giving up
    min_wait_time: Minimum wait time in seconds between attempts,
        forwarded to random_wait_time
    args: Positional arguments forwarded to `function`

    Returns
    -------
    Value returned by `function`, or None if every attempt failed.
    Each failure is reported on stdout.
    '''

    # Scraping can fail in many unrelated ways (network, webdriver,
    # parsing), so deliberately catch every exception type here.
    # pylint: disable=broad-except
    for num_attempt in range(max_retries):
        try:
            return function(*args)
        except Exception as error:
            # Two adjacent f-strings: avoids the backslash line
            # continuation that injected a run of spaces into the message.
            print(f'Function {function} failed at attempt {num_attempt} '
                  f'with exception {repr(error)}')
            time.sleep(random_wait_time(min_wait=min_wait_time))

    return None
36 changes: 31 additions & 5 deletions artscraper/googleart.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
from pathlib import Path
from time import sleep
from urllib.parse import urlparse
from urllib.parse import unquote

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager

from artscraper.base import BaseArtScraper
from artscraper.functions import random_wait_time
Expand All @@ -29,10 +32,10 @@ class GoogleArtScraper(BaseArtScraper):
is randomly drawn from a polynomial distribution.
"""

def __init__(self, output_dir=None, skip_existing=True, min_wait=5,
geckodriver_path="geckodriver"):
def __init__(self, output_dir=None, skip_existing=True, min_wait=5):
super().__init__(output_dir, skip_existing, min_wait=min_wait)
self.driver = webdriver.Firefox(executable_path=geckodriver_path)

self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
self.last_request = time.time() - 100

def __exit__(self, _exc_type, _exc_val, _exc_tb):
Expand All @@ -46,7 +49,7 @@ def load_link(self, link):
if self.output_dir is not None:
if (self.paint_dir.is_dir() and self.skip_existing
and Path(self.paint_dir, "metadata.json").is_file()
and Path(self.paint_dir, "painting.png").is_file()):
and Path(self.paint_dir, "artwork.png").is_file()):
return False
self.paint_dir.mkdir(exist_ok=True, parents=True)

Expand All @@ -57,6 +60,13 @@ def load_link(self, link):
@property
def paint_dir(self):
    """Output directory for the current artwork, derived from its URL.

    The last two path components of the artwork link are joined into a
    single identifier, percent-escapes are decoded, and the result is
    truncated so it stays within common filesystem name-length limits.
    """
    raw_id = "_".join(urlparse(self.link).path.split("/")[-2:])

    # Decode percent-escapes so directory names stay readable and
    # consistent regardless of URL encoding.
    raw_id = unquote(raw_id)
    # Guard against directory names exceeding filesystem limits.
    if len(raw_id) >= 256:
        raw_id = raw_id[:255]

    return Path(self.output_dir, raw_id)

def wait(self, min_wait, max_wait=None, update=True):
Expand Down Expand Up @@ -97,7 +107,7 @@ def get_main_text(self):
if elem.get_attribute("id").startswith("metadata-"):
return ''
inner_HTML = elem.get_attribute("innerHTML")
return BeautifulSoup(inner_HTML, features="html.parser").text
return unquote(BeautifulSoup(inner_HTML, features="html.parser").text)

def _get_metadata(self):
if self.output_dir is not None and self.meta_fp.is_file():
Expand All @@ -114,9 +124,11 @@ def _get_metadata(self):
paragraph_HTML = soup.find_all("li")
metadata = {}
metadata["main_text"] = self.get_main_text()
metadata["main_text"] = unquote(metadata["main_text"])
for par in paragraph_HTML:
name = par.find("span", text=True).contents[0].lower()[:-1]
metadata[name] = par.text[len(name) + 2:]
metadata[name] = unquote(metadata[name])
metadata["id"] = paint_id
return metadata

Expand Down Expand Up @@ -157,5 +169,19 @@ def save_image(self, img_fp=None, link=None):
with open(img_fp, "wb") as f:
f.write(self.get_image())

def save_artwork_information(self, link):
    """
    Given an artwork link, saves the image and the associated metadata.

    Loads the artwork page first, then writes the metadata file and the
    image file via the sibling save methods (order matters: the link
    must be loaded before either save call).

    Parameters
    ----------
    link: str
        Artwork URL.

    """
    self.load_link(link)
    self.save_metadata()
    self.save_image()

def close(self):
    """Close the Selenium webdriver window held by this scraper."""
    self.driver.close()
Loading