Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SPARQL query timeout problems #22

Merged
merged 12 commits into from
Jul 10, 2023
2 changes: 1 addition & 1 deletion artscraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Scrape art image and metadata from WikiArt and Google Arts."""

from artscraper.functions import random_wait_time
from artscraper.functions import random_wait_time, retry
from artscraper.googleart import GoogleArtScraper
from artscraper.wikiart import WikiArtScraper
from artscraper.find_artworks import FindArtworks
Expand Down
5 changes: 3 additions & 2 deletions artscraper/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,9 @@ def save_metadata(self, meta_fp=None):
return
metadata = self.get_metadata()
self.paint_dir.mkdir(exist_ok=True)
with open(meta_fp, "w", encoding="utf-8") as f: # pylint: disable=invalid-name
json.dump(metadata, f)
with open(meta_fp, "w", encoding='utf-8') as f:
# pylint: disable=invalid-name
json.dump(metadata, f, ensure_ascii=False)

@abstractmethod
def save_image(self, img_fp=None, link=None):
Expand Down
7 changes: 5 additions & 2 deletions artscraper/find_artists.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
'''

import time

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager

from artscraper.functions import random_wait_time

def get_artist_links(webpage='https://artsandculture.google.com/category/artist',
executable_path='geckodriver',
min_wait_time=5, output_file=None):
'''
Parameters
Expand All @@ -22,7 +25,7 @@ def get_artist_links(webpage='https://artsandculture.google.com/category/artist'
'''

# Launch Firefox browser
driver = webdriver.Firefox(executable_path=executable_path)
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))

# Get Google Arts & Culture webpage listing all artists
driver.get(webpage)
Expand Down
53 changes: 32 additions & 21 deletions artscraper/find_artworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@

import time
import re
from urllib.parse import unquote
import json
import requests

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager

import wikipediaapi

Expand All @@ -28,13 +32,11 @@ class FindArtworks:
given the link to their Google Arts & Culture webpage
'''

def __init__(self, artist_link, executable_path='geckodriver',
def __init__(self, artist_link,
output_dir='./data', sparql_query= None, min_wait_time=5):

# Link to artist's Google Arts & Culture webpage
self.artist_link = artist_link
# Path to geckodriver
self.executable_path = executable_path
# Directory to which the data is to be written
# Create it if it doesn't exist
Path(output_dir).mkdir(parents=True, exist_ok=True)
Expand All @@ -50,6 +52,7 @@ def __init__(self, artist_link, executable_path='geckodriver',
SELECT
?familyName ?familyNameLabel
?givenName ?givenNameLabel
?pseudonym ?pseudonymLabel
?sexOrGender ?sexOrGenderLabel
?dateOfBirth ?dateOfBirthLabel
?placeOfBirth ?placeOfBirthLabel
Expand All @@ -64,43 +67,46 @@ def __init__(self, artist_link, executable_path='geckodriver',
?workLocation ?workLocationLabel
?genre ?genreLabel
?movement ?movementLabel
?occupation ?occupationLabel
WHERE {
OPTIONAL { wd:person_id wdt:P734 ?familyName. }
OPTIONAL { wd:person_id wdt:P735 ?givenName. }
OPTIONAL { wd:person_id wdt:P742 ?pseudonym. }
OPTIONAL { wd:person_id wdt:P21 ?sexOrGender. }
OPTIONAL {
wd:person_id wdt:P569 ?dateTimeOfBirth.
BIND (xsd:date(?dateTimeOfBirth) AS ?dateOfBirth)
}
OPTIONAL { wd:person_id wdt:P19 ?placeOfBirth. }
OPTIONAL {
?placeOfBirth wdt:P625 ?coordinatesBirth.
BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth)
BIND(geof:longitude(?coordinatesBirth) AS ?longitudeOfPlaceOfBirth)
OPTIONAL {
wd:person_id wdt:P19 ?placeOfBirth.
?placeOfBirth wdt:P625 ?coordinatesBirth.
BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth)
BIND(geof:longitude(?coordinatesBirth) AS ?longitudeOfPlaceOfBirth)
}
OPTIONAL {
wd:person_id wdt:P569 ?dateTimeOfDeath.
wd:person_id wdt:P570 ?dateTimeOfDeath.
BIND (xsd:date(?dateTimeOfDeath) AS ?dateOfDeath)
}
OPTIONAL { wd:person_id wdt:P20 ?placeOfDeath. }
OPTIONAL {
?placeOfDeath wdt:P625 ?coordinatesDeath.
BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath)
BIND(geof:longitude(?coordinatesDeath) AS ?longitudeOfPlaceOfDeath)
OPTIONAL {
wd:person_id wdt:P20 ?placeOfDeath.
?placeOfDeath wdt:P625 ?coordinatesDeath.
BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath)
BIND(geof:longitude(?coordinatesDeath) AS ?longitudeOfPlaceOfDeath)
}
OPTIONAL { wd:person_id wdt:P27 ?countryOfCitizenship. }
OPTIONAL { wd:person_id wdt:P551 ?residence. }
OPTIONAL { wd:person_id wdt:P937 ?workLocation. }
OPTIONAL { wd:person_id wdt:P136 ?genre. }
OPTIONAL { wd:person_id wdt:P135 ?movement. }
OPTIONAL { wd:person_id wdt:P106 ?occupation. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
'''
else:
self.sparql_query = sparql_query

# Open web browser
self.driver = webdriver.Firefox(executable_path=self.executable_path)
self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))


def __enter__(self):
Expand Down Expand Up @@ -145,7 +151,7 @@ def save_artist_information(self):
# Filenames for artist's works, description, metadata
artist_works_file = pathname_directory + '/' + 'works.txt'
artist_description_file = pathname_directory + '/' + 'description.txt'
artist_metadata_file = pathname_directory + '/' + 'metadata.txt'
artist_metadata_file = pathname_directory + '/' + 'metadata.json'

# Save artist's works, description, metadata
with open(artist_works_file, 'w', encoding='utf-8') as file:
Expand All @@ -154,8 +160,7 @@ def save_artist_information(self):
with open(artist_description_file, 'w', encoding='utf-8') as file:
file.write(artist_description)
with open(artist_metadata_file, 'w', encoding='utf-8') as file:
for key,value in artist_metadata.items():
file.write(f'{key} : {value}\n')
json.dump(artist_metadata, file, ensure_ascii=False)


def get_artist_works(self):
Expand All @@ -182,13 +187,13 @@ def get_artist_works(self):

# Check if right arrow button can still be clicked
while right_arrow_element.get_attribute('tabindex') is not None:
# Wait for page to load
time.sleep(random_wait_time(min_wait=self.min_wait_time))
# Find right arrow button
right_arrow_element = parent_element.find_element('xpath', \
'.//*[contains(@data-gaaction,"rightArrow")]')
# Click on right arrow button
self.driver.execute_script("arguments[0].click();", right_arrow_element)
# Wait for page to load
time.sleep(random_wait_time(min_wait=self.min_wait_time))

# List of all elements with links to artworks
elements = right_arrow_element.find_elements('xpath', \
Expand Down Expand Up @@ -218,6 +223,8 @@ def get_artist_description(self):
# Get summary of the page (lead section of the Wikipedia article)
description = page.summary

description = unquote(description)

return description

def get_artist_metadata(self):
Expand All @@ -239,7 +246,8 @@ def get_artist_metadata(self):
query = self.sparql_query.replace('person_id', artist_id)

# Send query request
request = requests.get(url, params= {'format': 'json', 'query': ''.join(query)}, timeout=30)
request = requests.get(url, params={'format': 'json', \
'query': ''.join(query)}, timeout=120)

# Convert response to dictionary
data = request.json()
Expand Down Expand Up @@ -292,6 +300,8 @@ def get_wikipedia_article_title(self):
# Get title of artist's Wikipedia article
title = wikipedia_link.rsplit('/')[-1]

title = unquote(title)

return title

def get_artist_wikidata_id(self):
Expand Down Expand Up @@ -340,6 +350,7 @@ def _get_property(self, data, query_property):
if query_property+'Label' in data['results']['bindings'][0].keys():
for element in data['results']['bindings']:
output_property = element[query_property+'Label']['value']
output_property = unquote(output_property)
# Avoid duplicates
if output_property not in output_property_list:
output_property_list.append(output_property)
Expand Down
44 changes: 40 additions & 4 deletions artscraper/functions.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
'''
random_wait_time: Function to determine a random wait time
between two events
'''
"""

Functions used repeatedly, and in many places:

random_wait_time
retry

"""

import time
from random import random

def random_wait_time(min_wait=5, max_wait=None):
Expand All @@ -26,6 +31,7 @@ def random_wait_time(min_wait=5, max_wait=None):
Waiting time between `min_wait` and `max_wait` according to
the polynomial PDF.
"""

# pylint: disable=invalid-name
if max_wait is None:
max_wait = 3 * min_wait
Expand All @@ -42,3 +48,33 @@ def inv_cdf(x):
return (b**-beta - beta * x / a)**(-1 / beta)

return inv_cdf(random())


def retry(function, max_retries, min_wait_time, *args):
    '''
    Call `function`, retrying with a randomized delay after each failure.

    Parameters
    ----------
    function: Callable to run (and re-run on failure)
    max_retries: Maximum number of attempts before giving up
    min_wait_time: Minimum wait time in seconds between attempts,
        forwarded to random_wait_time
    args: Positional arguments forwarded to `function`

    Returns
    -------
    Value returned by `function`, or None if every attempt failed.
    Each failure is reported on stdout.
    '''

    # Scraping can fail in many unrelated ways (network, webdriver,
    # parsing), so deliberately catch every exception type here.
    # pylint: disable=broad-except
    for num_attempt in range(max_retries):
        try:
            return function(*args)
        except Exception as error:
            # Two adjacent f-strings: avoids the backslash line
            # continuation that injected a run of spaces into the message.
            print(f'Function {function} failed at attempt {num_attempt} '
                  f'with exception {repr(error)}')
            time.sleep(random_wait_time(min_wait=min_wait_time))

    return None
36 changes: 31 additions & 5 deletions artscraper/googleart.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
from pathlib import Path
from time import sleep
from urllib.parse import urlparse
from urllib.parse import unquote

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager

from artscraper.base import BaseArtScraper
from artscraper.functions import random_wait_time
Expand All @@ -29,10 +32,10 @@ class GoogleArtScraper(BaseArtScraper):
is randomly drawn from a polynomial distribution.
"""

def __init__(self, output_dir=None, skip_existing=True, min_wait=5,
geckodriver_path="geckodriver"):
def __init__(self, output_dir=None, skip_existing=True, min_wait=5):
super().__init__(output_dir, skip_existing, min_wait=min_wait)
self.driver = webdriver.Firefox(executable_path=geckodriver_path)

self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
self.last_request = time.time() - 100

def __exit__(self, _exc_type, _exc_val, _exc_tb):
Expand All @@ -46,7 +49,7 @@ def load_link(self, link):
if self.output_dir is not None:
if (self.paint_dir.is_dir() and self.skip_existing
and Path(self.paint_dir, "metadata.json").is_file()
and Path(self.paint_dir, "painting.png").is_file()):
and Path(self.paint_dir, "artwork.png").is_file()):
return False
self.paint_dir.mkdir(exist_ok=True, parents=True)

Expand All @@ -57,6 +60,13 @@ def load_link(self, link):
@property
def paint_dir(self):
    """Output directory for the current artwork, derived from its URL.

    The last two path components of the artwork link are joined into a
    single identifier, percent-escapes are decoded, and the result is
    truncated so it stays within common filesystem name-length limits.
    """
    raw_id = "_".join(urlparse(self.link).path.split("/")[-2:])

    # Decode percent-escapes so directory names stay readable and
    # consistent regardless of URL encoding.
    raw_id = unquote(raw_id)
    # Guard against directory names exceeding filesystem limits.
    if len(raw_id) >= 256:
        raw_id = raw_id[:255]

    return Path(self.output_dir, raw_id)

def wait(self, min_wait, max_wait=None, update=True):
Expand Down Expand Up @@ -97,7 +107,7 @@ def get_main_text(self):
if elem.get_attribute("id").startswith("metadata-"):
return ''
inner_HTML = elem.get_attribute("innerHTML")
return BeautifulSoup(inner_HTML, features="html.parser").text
return unquote(BeautifulSoup(inner_HTML, features="html.parser").text)

def _get_metadata(self):
if self.output_dir is not None and self.meta_fp.is_file():
Expand All @@ -114,9 +124,11 @@ def _get_metadata(self):
paragraph_HTML = soup.find_all("li")
metadata = {}
metadata["main_text"] = self.get_main_text()
metadata["main_text"] = unquote(metadata["main_text"])
for par in paragraph_HTML:
name = par.find("span", text=True).contents[0].lower()[:-1]
metadata[name] = par.text[len(name) + 2:]
metadata[name] = unquote(metadata[name])
metadata["id"] = paint_id
return metadata

Expand Down Expand Up @@ -157,5 +169,19 @@ def save_image(self, img_fp=None, link=None):
with open(img_fp, "wb") as f:
f.write(self.get_image())

def save_artwork_information(self, link):
    """
    Given an artwork link, saves the image and the associated metadata.

    Loads the artwork page first, then writes the metadata file and the
    image file via the sibling save methods (order matters: the link
    must be loaded before either save call).

    Parameters
    ----------
    link: str
        Artwork URL.

    """
    self.load_link(link)
    self.save_metadata()
    self.save_image()

def close(self):
    """Close the Selenium webdriver window held by this scraper."""
    self.driver.close()
Loading