From 1a88b390722e7b98e32a2b4376574b37b08a63ee Mon Sep 17 00:00:00 2001 From: modhurita Date: Tue, 20 Jun 2023 13:37:43 +0200 Subject: [PATCH 01/12] add basic error handling and retrying --- artscraper/find_artworks.py | 2 +- ...all_artworks_error_handling_retrying.ipynb | 1072 +++++++++++++++++ 2 files changed, 1073 insertions(+), 1 deletion(-) create mode 100644 examples/example_collect_all_artworks_error_handling_retrying.ipynb diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index 9127379..09f07d6 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -239,7 +239,7 @@ def get_artist_metadata(self): query = self.sparql_query.replace('person_id', artist_id) # Send query request - request = requests.get(url, params= {'format': 'json', 'query': ''.join(query)}, timeout=30) + request = requests.get(url, params= {'format': 'json', 'query': ''.join(query)}, timeout=120) # Convert response to dictionary data = request.json() diff --git a/examples/example_collect_all_artworks_error_handling_retrying.ipynb b/examples/example_collect_all_artworks_error_handling_retrying.ipynb new file mode 100644 index 0000000..7c98337 --- /dev/null +++ b/examples/example_collect_all_artworks_error_handling_retrying.ipynb @@ -0,0 +1,1072 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9b42eac8", + "metadata": {}, + "source": [ + "# Find links to Google Arts & Culture webpages of all artists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "229a67a1", + "metadata": {}, + "outputs": [], + "source": [ + "from artscraper import get_artist_links" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77cd3e7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Get links for all artists, as a list\n", + "#artist_urls = get_artist_links(executable_path='geckodriver', min_wait_time=1, output_file='artist_links.txt')" + ] + }, + { + "cell_type": "markdown", + "id": "d83d0691", + "metadata": {}, + "source": [ + "# Collect artworks and metadata for all artists" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "194ac6af", + "metadata": {}, + "outputs": [], + "source": [ + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8583c7c0", + "metadata": {}, + "outputs": [], + "source": [ + "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c7c7aed1", + "metadata": {}, + "outputs": [], + "source": [ + "min_wait_time = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c3a16846", + "metadata": {}, + "outputs": [], + "source": [ + "# Subset of artist links, for illustration purposes\n", + "artist_urls = ['https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2',\n", + " 'https://artsandculture.google.com/entity/claude-monet/m01xnj',\n", + " 'https://artsandculture.google.com/entity/banksy/m023b7b',\n", + " 'https://artsandculture.google.com/entity/rembrandt/m0bskv2',\n", + " 'https://artsandculture.google.com/entity/raphael/m0c43g']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "43ae9afa", + "metadata": {}, + "outputs": [], + "source": [ + "# Directory in which the data is to be stored\n", + "output_dir = './data'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c34d9016", + "metadata": {}, + "outputs": [], + "source": [ + "# Maximum number of attempts to perform a task \n", + "max_retries = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad0f8897", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artist URL: https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2\n", + "artwork URL: https://artsandculture.google.com/asset/undergrowth-with-two-figures-vincent-van-gogh-dutch-b-1853-d-1890/PgGaehoXTiERQQ\n", + "artwork URL: https://artsandculture.google.com/asset/head-of-a-skeleton-with-a-burning-cigarette-vincent-van-gogh/hQGZa2I9Xi6lpA\n", + "artwork URL: https://artsandculture.google.com/asset/the-starry-night-vincent-van-gogh/bgEuwDxel93-Pg\n", + "artwork URL: https://artsandculture.google.com/asset/self-portrait/9gFw_1Vou2CkwQ\n", + "artwork URL: https://artsandculture.google.com/asset/country-road-in-provence-by-night-vincent-van-gogh/4wEXP9j2v6hpYw\n", + "artwork URL: https://artsandculture.google.com/asset/almond-blossom-vincent-van-gogh/dAFXSL9sZ1ulDw\n", + "artwork URL: https://artsandculture.google.com/asset/wheatfield-with-crows-vincent-van-gogh/dwFdD5AMQfpSew\n", + "artwork URL: https://artsandculture.google.com/asset/roses-vincent-van-gogh/UQGFh2ps12F5hw\n", + "artwork URL: https://artsandculture.google.com/asset/the-yellow-house-the-street-vincent-van-gogh/4gEx_EL470OSUw\n", + "artwork URL: https://artsandculture.google.com/asset/the-potato-eaters-vincent-van-gogh/7gFcKarE9QeaXw\n", + "artwork URL: https://artsandculture.google.com/asset/sunflowers-vincent-van-gogh/hwEGmsM-FoHAwA\n", + "artwork URL: https://artsandculture.google.com/asset/starry-night/uQE3XORhSK37Dw\n", + "artwork URL: https://artsandculture.google.com/asset/the-bedroom-vincent-van-gogh/KwF-AdF1REQl6w\n", + "artwork URL: https://artsandculture.google.com/asset/self-portrait-with-a-straw-hat-obverse-the-potato-peeler-vincent-van-gogh/zQGHorekyP-67w\n", + "artwork URL: https://artsandculture.google.com/asset/cypresses-vincent-van-gogh/zwEHXljwy3BidA\n", + "artwork URL: https://artsandculture.google.com/asset/van-gogh-s-bedroom-in-arles/kQEugEsNjGDZfw\n", + "artwork URL: https://artsandculture.google.com/asset/the-church-in-auvers-sur-oise-view-from-the-chevet/6wEjLceQPXkTtA\n", + "artwork URL: https://artsandculture.google.com/asset/roses-vincent-van-gogh/DgElRwoxZWloQQ\n", + "artwork URL: https://artsandculture.google.com/asset/self-portrait-with-grey-felt-hat-vincent-van-gogh/PgEJ1hPIzqsM2w\n", + "artwork URL: https://artsandculture.google.com/asset/sorrowing-old-man-at-eternity-s-gate-vincent-van-gogh/ywEJUSEHQmoNYw\n", + "artwork URL: https://artsandculture.google.com/asset/the-bedroom-vincent-van-gogh-dutch-1853-1890/rgHdFPzCeCfnxQ\n", + "artwork URL: https://artsandculture.google.com/asset/wheatfield-under-thunderclouds-vincent-van-gogh/kAErTfh0dORNwQ\n", + "artwork URL: https://artsandculture.google.com/asset/irises-vincent-van-gogh/ZQH2h7PBY47yXQ\n", + "artwork URL: https://artsandculture.google.com/asset/wheatfield-with-a-reaper-vincent-van-gogh/BgFGcS3ucZqeRA\n", + "artwork URL: https://artsandculture.google.com/asset/sunflowers-vincent-van-gogh/XwHuufJZYFvUnA\n", + "artwork URL: https://artsandculture.google.com/asset/postman-joseph-roulin-vincent-van-gogh/nwEw_d8jN9jVbw\n", + "artwork URL: https://artsandculture.google.com/asset/the-harvest-vincent-van-gogh/UAEejbUbf7fwSg\n", + "artwork URL: https://artsandculture.google.com/asset/first-steps-after-millet-vincent-van-gogh/jAE8KAdj05Buug\n", + "artwork URL: https://artsandculture.google.com/asset/green-field-vincent-van-gogh/EQF2FvGUZzLKOA\n", + "Error at attempt 0: Message: Unable to locate element: /html/body/div[3]/div[3]/div/div/div[2]/div[3]\n", + "Stacktrace:\n", + "RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8\n", + "WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:187:5\n", + "NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:505:5\n", + "element.find/ Date: Tue, 27 Jun 2023 13:22:49 +0200 Subject: [PATCH 02/12] add retry functionality --- artscraper/__init__.py | 2 +- artscraper/find_artworks.py | 16 +++++++++------- artscraper/functions.py | 31 +++++++++++++++++++++++++++++-- artscraper/googleart.py | 13 +++++++------ 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/artscraper/__init__.py b/artscraper/__init__.py index eb815c5..776d896 100644 --- a/artscraper/__init__.py +++ b/artscraper/__init__.py @@ -1,6 +1,6 @@ """Scrape art image and metadata from WikiArt and Google Arts.""" -from artscraper.functions import random_wait_time +from artscraper.functions import random_wait_time, retry from artscraper.googleart import GoogleArtScraper from artscraper.wikiart import WikiArtScraper from artscraper.find_artworks import FindArtworks diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index 09f07d6..f1a465a 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -20,7 +20,7 @@ import wikipediaapi -from artscraper.functions import random_wait_time +from artscraper.functions import random_wait_time, retry class FindArtworks: ''' @@ -29,7 +29,7 @@ class FindArtworks: ''' def __init__(self, artist_link, executable_path='geckodriver', - output_dir='./data', sparql_query= None, min_wait_time=5): + output_dir='./data', sparql_query= None, min_wait_time=5, max_retries=10): # Link to artist's Google Arts & Culture webpage self.artist_link = artist_link @@ -42,7 +42,9 @@ def __init__(self, artist_link, executable_path='geckodriver', self.output_dir = output_dir # Minimum wait time between two clicks while scrolling a webpage self.min_wait_time = min_wait_time - + # Total number of attempts at executing a function before giving up + self.max_retries = max_retries + # SPARQL query to fetch metadata from wikidata if sparql_query is None: # Default SPARQL query @@ -120,10 +122,10 @@ def get_artist_information(self): artist_works, artist_description, artist_metadata : All information about the artist ''' - - artist_works = self.get_artist_works() - artist_description = self.get_artist_description() - artist_metadata = self.get_artist_metadata() + + artist_works = retry(self.get_artist_works, self.max_retries, self.min_wait_time) + artist_description = retry(self.get_artist_description, self.max_retries, self.min_wait_time) + artist_metadata = retry(self.get_artist_metadata, self.max_retries, self.min_wait_time) return artist_works, artist_description, artist_metadata diff --git a/artscraper/functions.py b/artscraper/functions.py index 82bbd41..16d6b6f 100644 --- a/artscraper/functions.py +++ b/artscraper/functions.py @@ -1,10 +1,11 @@ +import time +from random import random + ''' random_wait_time: Function to determine a random wait time between two events ''' -from random import random - def random_wait_time(min_wait=5, max_wait=None): """Compute a random wait time. @@ -42,3 +43,29 @@ def inv_cdf(x): return (b**-beta - beta * x / a)**(-1 / beta) return inv_cdf(random()) + + +def retry(function, max_retries=10, min_wait_time=10, *args): + ''' + Parameters + ---------- + function: Function to run again + max_retries: Maximum number of times to retry + args: Arguments of the function + + Returns + ------- + Value returned by function, or prints an error message + ''' + + num_attempt = 0 + while num_attempt < max_retries: + + try: + return function(*args) + except Exception as e: + print(f'Function {function} failed at attempt {num_attempt} with exception {repr(e)}: {str(e)}') + time.sleep(random_wait_time(min_wait=min_wait_time)) + num_attempt = num_attempt + 1 + + return None diff --git a/artscraper/googleart.py b/artscraper/googleart.py index e648bcd..b134521 100755 --- a/artscraper/googleart.py +++ b/artscraper/googleart.py @@ -12,7 +12,7 @@ from selenium.webdriver.common.keys import Keys from artscraper.base import BaseArtScraper -from artscraper.functions import random_wait_time +from artscraper.functions import random_wait_time, retry class GoogleArtScraper(BaseArtScraper): """Class for scraping GoogleArt images. @@ -30,8 +30,8 @@ class GoogleArtScraper(BaseArtScraper): """ def __init__(self, output_dir=None, skip_existing=True, min_wait=5, - geckodriver_path="geckodriver"): - super().__init__(output_dir, skip_existing, min_wait=min_wait) + geckodriver_path="geckodriver", max_retries=10): + super().__init__(output_dir, skip_existing, min_wait=min_wait, max_retries=max_retries) self.driver = webdriver.Firefox(executable_path=geckodriver_path) self.last_request = time.time() - 100 @@ -46,12 +46,13 @@ def load_link(self, link): if self.output_dir is not None: if (self.paint_dir.is_dir() and self.skip_existing and Path(self.paint_dir, "metadata.json").is_file() - and Path(self.paint_dir, "painting.png").is_file()): + and Path(self.paint_dir, "artwork.png").is_file()): return False self.paint_dir.mkdir(exist_ok=True, parents=True) self.wait(self.min_wait) - self.driver.get(link) + #self.driver.get(link) + retry(self.driver.get, self.max_retries, link) return True @property @@ -107,7 +108,7 @@ def _get_metadata(self): paint_id = urlparse(self.link).path.split("/")[-1] self.wait(self.min_wait, update=False) - elem = self.driver.find_element("xpath", f'//*[@id="metadata-{paint_id}"]') + elem = retry(self.driver.find_element, self.max_retries, self.min_wait_time, "xpath", f'//*[@id="metadata-{paint_id}"]') inner_HTML = elem.get_attribute("innerHTML") soup = BeautifulSoup(inner_HTML, features="html.parser") From 3720d13dc19161e1ca54dcdd2d3921a2565ff644 Mon Sep 17 00:00:00 2001 From: modhurita Date: Wed, 28 Jun 2023 12:19:23 +0200 Subject: [PATCH 03/12] fix long filename error, write metadata to json file --- artscraper/find_artworks.py | 29 +- artscraper/functions.py | 3 +- artscraper/googleart.py | 22 +- ...all_artworks_error_handling_retrying.ipynb | 1220 ++++------------- 4 files changed, 285 insertions(+), 989 deletions(-) diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index f1a465a..68deaeb 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -15,12 +15,13 @@ import time import re import requests +import json from selenium import webdriver import wikipediaapi -from artscraper.functions import random_wait_time, retry +from artscraper.functions import random_wait_time class FindArtworks: ''' @@ -29,7 +30,7 @@ class FindArtworks: ''' def __init__(self, artist_link, executable_path='geckodriver', - output_dir='./data', sparql_query= None, min_wait_time=5, max_retries=10): + output_dir='./data', sparql_query= None, min_wait_time=5): # Link to artist's Google Arts & Culture webpage self.artist_link = artist_link @@ -42,9 +43,7 @@ def __init__(self, artist_link, executable_path='geckodriver', self.output_dir = output_dir # Minimum wait time between two clicks while scrolling a webpage self.min_wait_time = min_wait_time - # Total number of attempts at executing a function before giving up - self.max_retries = max_retries - + # SPARQL query to fetch metadata from wikidata if sparql_query is None: # Default SPARQL query @@ -66,6 +65,7 @@ def __init__(self, artist_link, executable_path='geckodriver', ?workLocation ?workLocationLabel ?genre ?genreLabel ?movement ?movementLabel + ?occupation ?occupationLabel WHERE { OPTIONAL { wd:person_id wdt:P734 ?familyName. } OPTIONAL { wd:person_id wdt:P735 ?givenName. } @@ -95,6 +95,7 @@ def __init__(self, artist_link, executable_path='geckodriver', OPTIONAL { wd:person_id wdt:P937 ?workLocation. } OPTIONAL { wd:person_id wdt:P136 ?genre. } OPTIONAL { wd:person_id wdt:P135 ?movement. } + OPTIONAL { wd:person_id wdt:P106 ?occupation. } SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } } ''' @@ -122,10 +123,10 @@ def get_artist_information(self): artist_works, artist_description, artist_metadata : All information about the artist ''' - - artist_works = retry(self.get_artist_works, self.max_retries, self.min_wait_time) - artist_description = retry(self.get_artist_description, self.max_retries, self.min_wait_time) - artist_metadata = retry(self.get_artist_metadata, self.max_retries, self.min_wait_time) + + artist_works = self.get_artist_works() + artist_description = self.get_artist_description() + artist_metadata = self.get_artist_metadata() return artist_works, artist_description, artist_metadata @@ -147,7 +148,7 @@ def save_artist_information(self): # Filenames for artist's works, description, metadata artist_works_file = pathname_directory + '/' + 'works.txt' artist_description_file = pathname_directory + '/' + 'description.txt' - artist_metadata_file = pathname_directory + '/' + 'metadata.txt' + artist_metadata_file = pathname_directory + '/' + 'metadata.json' # Save artist's works, description, metadata with open(artist_works_file, 'w', encoding='utf-8') as file: @@ -156,8 +157,9 @@ def save_artist_information(self): with open(artist_description_file, 'w', encoding='utf-8') as file: file.write(artist_description) with open(artist_metadata_file, 'w', encoding='utf-8') as file: - for key,value in artist_metadata.items(): - file.write(f'{key} : {value}\n') + #for key,value in artist_metadata.items(): + #file.write(f'{key} : {value}\n') + json.dump(artist_metadata, file) def get_artist_works(self): @@ -184,13 +186,14 @@ def get_artist_works(self): # Check if right arrow button can still be clicked while right_arrow_element.get_attribute('tabindex') is not None: + time.sleep(random_wait_time(min_wait=self.min_wait_time)) # Find right arrow button right_arrow_element = parent_element.find_element('xpath', \ './/*[contains(@data-gaaction,"rightArrow")]') # Click on right arrow button self.driver.execute_script("arguments[0].click();", right_arrow_element) # Wait for page to load - time.sleep(random_wait_time(min_wait=self.min_wait_time)) + #time.sleep(random_wait_time(min_wait=self.min_wait_time)) # List of all elements with links to artworks elements = right_arrow_element.find_elements('xpath', \ diff --git a/artscraper/functions.py b/artscraper/functions.py index 16d6b6f..84d0778 100644 --- a/artscraper/functions.py +++ b/artscraper/functions.py @@ -64,7 +64,8 @@ def retry(function, max_retries=10, min_wait_time=10, *args): try: return function(*args) except Exception as e: - print(f'Function {function} failed at attempt {num_attempt} with exception {repr(e)}: {str(e)}') + #print(f'Function {function} failed at attempt {num_attempt} with exception {repr(e)}: {str(e)}') + print(f'Function {function} failed at attempt {num_attempt} with exception {repr(e)}') time.sleep(random_wait_time(min_wait=min_wait_time)) num_attempt = num_attempt + 1 diff --git a/artscraper/googleart.py b/artscraper/googleart.py index b134521..0364135 100755 --- a/artscraper/googleart.py +++ b/artscraper/googleart.py @@ -12,7 +12,7 @@ from selenium.webdriver.common.keys import Keys from artscraper.base import BaseArtScraper -from artscraper.functions import random_wait_time, retry +from artscraper.functions import random_wait_time class GoogleArtScraper(BaseArtScraper): """Class for scraping GoogleArt images. @@ -30,8 +30,8 @@ class GoogleArtScraper(BaseArtScraper): """ def __init__(self, output_dir=None, skip_existing=True, min_wait=5, - geckodriver_path="geckodriver", max_retries=10): - super().__init__(output_dir, skip_existing, min_wait=min_wait, max_retries=max_retries) + geckodriver_path="geckodriver"): + super().__init__(output_dir, skip_existing, min_wait=min_wait) self.driver = webdriver.Firefox(executable_path=geckodriver_path) self.last_request = time.time() - 100 @@ -43,7 +43,7 @@ def load_link(self, link): return False self.link = link - if self.output_dir is not None: + if self.output_dir is not None: if (self.paint_dir.is_dir() and self.skip_existing and Path(self.paint_dir, "metadata.json").is_file() and Path(self.paint_dir, "artwork.png").is_file()): @@ -51,13 +51,16 @@ def load_link(self, link): self.paint_dir.mkdir(exist_ok=True, parents=True) self.wait(self.min_wait) - #self.driver.get(link) - retry(self.driver.get, self.max_retries, link) + self.driver.get(link) return True @property def paint_dir(self): paint_id = "_".join(urlparse(self.link).path.split("/")[-2:]) + + # Prevent problems with too-long file/directory names + paint_id = paint_id[0:255] + return Path(self.output_dir, paint_id) def wait(self, min_wait, max_wait=None, update=True): @@ -108,7 +111,7 @@ def _get_metadata(self): paint_id = urlparse(self.link).path.split("/")[-1] self.wait(self.min_wait, update=False) - elem = retry(self.driver.find_element, self.max_retries, self.min_wait_time, "xpath", f'//*[@id="metadata-{paint_id}"]') + elem = self.driver.find_element("xpath", f'//*[@id="metadata-{paint_id}"]') inner_HTML = elem.get_attribute("innerHTML") soup = BeautifulSoup(inner_HTML, features="html.parser") @@ -158,5 +161,10 @@ def save_image(self, img_fp=None, link=None): with open(img_fp, "wb") as f: f.write(self.get_image()) + def save_artwork_information(self, link): + self.load_link(link) + self.save_metadata() + self.save_image() + def close(self): self.driver.close() diff --git a/examples/example_collect_all_artworks_error_handling_retrying.ipynb b/examples/example_collect_all_artworks_error_handling_retrying.ipynb index 7c98337..8569920 100644 --- a/examples/example_collect_all_artworks_error_handling_retrying.ipynb +++ b/examples/example_collect_all_artworks_error_handling_retrying.ipynb @@ -2,1049 +2,333 @@ "cells": [ { "cell_type": "markdown", - "id": "9b42eac8", + "id": "d83d0691", + "metadata": {}, + "source": [ + "# Collect artworks and metadata for all artists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "194ac6af", + "metadata": {}, + "outputs": [], + "source": [ + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8583c7c0", + "metadata": {}, + "outputs": [], + "source": [ + "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c34d9016", "metadata": {}, + "outputs": [], "source": [ - "# Find links to Google Arts & Culture webpages of all artists" + "# Maximum number of attempts to perform a task \n", + "max_retries = 3" ] }, { "cell_type": "code", "execution_count": null, - "id": "229a67a1", + "id": "c7c7aed1", "metadata": {}, "outputs": [], "source": [ - "from artscraper import get_artist_links" + "min_wait_time = 10" ] }, { "cell_type": "code", "execution_count": null, - "id": "77cd3e7d", + "id": "8d21abe7", "metadata": {}, "outputs": [], "source": [ - "# Get links for all artists, as a list\n", - "#artist_urls = get_artist_links(executable_path='geckodriver', min_wait_time=1, output_file='artist_links.txt')" + "# Artist Clementine Hunter, 27 artworks\n", + "artist_urls = ['https://artsandculture.google.com/entity/clementine-hunter/m0d1k7n']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43ae9afa", + "metadata": {}, + "outputs": [], + "source": [ + "# Directory in which the data is to be stored\n", + "output_dir = './data'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb623d6b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time \n", + "\n", + "# Find_artworks for each artist\n", + "for artist_url in artist_urls:\n", + " with FindArtworks(artist_link=artist_url, output_dir=output_dir, \n", + " min_wait_time=min_wait_time) as scraper:\n", + " # Save list of works, description, and metadata for an artist\n", + " retry(scraper.save_artist_information, max_retries, min_wait_time)\n", + " # Create directory for this artist\n", + " artist_dir = output_dir + '/' + scraper.get_wikipedia_article_title() \n", + " # Get list of links to this artist's works \n", + " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", + " artwork_links = [line.rstrip() for line in file] \n", + " # Scrape artworks\n", + " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", + " # Go through each artwork link\n", + " for url in artwork_links:\n", + " print(f'artwork URL: {url}')\n", + " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aaf2acf7", + "metadata": {}, + "outputs": [], + "source": [ + "len(artwork_links)" ] }, { "cell_type": "markdown", - "id": "d83d0691", + "id": "3519fb2e", "metadata": {}, "source": [ - "# Collect artworks and metadata for all artists" + "# Display results" + ] + }, + { + "cell_type": "markdown", + "id": "f8cfeece", + "metadata": {}, + "source": [ + "## Display contents of data directory" ] }, { "cell_type": "code", - "execution_count": 1, - "id": "194ac6af", + "execution_count": null, + "id": "9f56346a", "metadata": {}, "outputs": [], "source": [ - "import time" + "!ls data" + ] + }, + { + "cell_type": "markdown", + "id": "eff822f0", + "metadata": {}, + "source": [ + "## Display contents of directory for one artist" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "8583c7c0", + "execution_count": null, + "id": "fc4d3d90", "metadata": {}, "outputs": [], "source": [ - "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time" + "!ls ./data/Clementine_Hunter" + ] + }, + { + "cell_type": "markdown", + "id": "e0921cb7", + "metadata": {}, + "source": [ + "## Description of artist" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "c7c7aed1", + "execution_count": null, + "id": "38079197", "metadata": {}, "outputs": [], "source": [ - "min_wait_time = 10" + "!cat ./data/Clementine_Hunter/description.txt" + ] + }, + { + "cell_type": "markdown", + "id": "322e9c5b", + "metadata": {}, + "source": [ + "## Metadata of artist" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "c3a16846", + "execution_count": null, + "id": "2e5ef192", "metadata": {}, "outputs": [], "source": [ - "# Subset of artist links, for illustration purposes\n", - "artist_urls = ['https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2',\n", - " 'https://artsandculture.google.com/entity/claude-monet/m01xnj',\n", - " 'https://artsandculture.google.com/entity/banksy/m023b7b',\n", - " 'https://artsandculture.google.com/entity/rembrandt/m0bskv2',\n", - " 'https://artsandculture.google.com/entity/raphael/m0c43g']" + "!cat ./data/Clementine_Hunter/metadata.json" + ] + }, + { + "cell_type": "markdown", + "id": "63251f32", + "metadata": {}, + "source": [ + "## Directory containing works of this artist" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "43ae9afa", + "execution_count": null, + "id": "1cd0d995", "metadata": {}, "outputs": [], "source": [ - "# Directory in which the data is to be stored\n", - "output_dir = './data'" + "!ls ./data/Clementine_Hunter/works" + ] + }, + { + "cell_type": "markdown", + "id": "4c20d8c2", + "metadata": {}, + "source": [ + "## Directory containing one artwork by this artist" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "c34d9016", + "execution_count": null, + "id": "256919d3", "metadata": {}, "outputs": [], "source": [ - "# Maximum number of attempts to perform a task \n", - "max_retries = 10" + "!ls ./data/Clementine_Hunter/works/flowers-clementine-hunter_zQERekxk8d_F8g" + ] + }, + { + "cell_type": "markdown", + "id": "6829e0a2", + "metadata": {}, + "source": [ + "## Display metadata for this artwork" ] }, { "cell_type": "code", "execution_count": null, - "id": "ad0f8897", + "id": "b5504ef7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artist URL: https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2\n", - "artwork URL: https://artsandculture.google.com/asset/undergrowth-with-two-figures-vincent-van-gogh-dutch-b-1853-d-1890/PgGaehoXTiERQQ\n", - "artwork URL: https://artsandculture.google.com/asset/head-of-a-skeleton-with-a-burning-cigarette-vincent-van-gogh/hQGZa2I9Xi6lpA\n", - "artwork URL: https://artsandculture.google.com/asset/the-starry-night-vincent-van-gogh/bgEuwDxel93-Pg\n", - "artwork URL: https://artsandculture.google.com/asset/self-portrait/9gFw_1Vou2CkwQ\n", - "artwork URL: https://artsandculture.google.com/asset/country-road-in-provence-by-night-vincent-van-gogh/4wEXP9j2v6hpYw\n", - "artwork URL: https://artsandculture.google.com/asset/almond-blossom-vincent-van-gogh/dAFXSL9sZ1ulDw\n", - "artwork URL: https://artsandculture.google.com/asset/wheatfield-with-crows-vincent-van-gogh/dwFdD5AMQfpSew\n", - "artwork URL: https://artsandculture.google.com/asset/roses-vincent-van-gogh/UQGFh2ps12F5hw\n", - "artwork URL: https://artsandculture.google.com/asset/the-yellow-house-the-street-vincent-van-gogh/4gEx_EL470OSUw\n", - "artwork URL: https://artsandculture.google.com/asset/the-potato-eaters-vincent-van-gogh/7gFcKarE9QeaXw\n", - "artwork URL: https://artsandculture.google.com/asset/sunflowers-vincent-van-gogh/hwEGmsM-FoHAwA\n", - "artwork URL: https://artsandculture.google.com/asset/starry-night/uQE3XORhSK37Dw\n", - "artwork URL: https://artsandculture.google.com/asset/the-bedroom-vincent-van-gogh/KwF-AdF1REQl6w\n", - "artwork URL: https://artsandculture.google.com/asset/self-portrait-with-a-straw-hat-obverse-the-potato-peeler-vincent-van-gogh/zQGHorekyP-67w\n", - "artwork URL: https://artsandculture.google.com/asset/cypresses-vincent-van-gogh/zwEHXljwy3BidA\n", - "artwork URL: https://artsandculture.google.com/asset/van-gogh-s-bedroom-in-arles/kQEugEsNjGDZfw\n", - "artwork URL: https://artsandculture.google.com/asset/the-church-in-auvers-sur-oise-view-from-the-chevet/6wEjLceQPXkTtA\n", - "artwork URL: https://artsandculture.google.com/asset/roses-vincent-van-gogh/DgElRwoxZWloQQ\n", - "artwork URL: https://artsandculture.google.com/asset/self-portrait-with-grey-felt-hat-vincent-van-gogh/PgEJ1hPIzqsM2w\n", - "artwork URL: https://artsandculture.google.com/asset/sorrowing-old-man-at-eternity-s-gate-vincent-van-gogh/ywEJUSEHQmoNYw\n", - "artwork URL: https://artsandculture.google.com/asset/the-bedroom-vincent-van-gogh-dutch-1853-1890/rgHdFPzCeCfnxQ\n", - "artwork URL: https://artsandculture.google.com/asset/wheatfield-under-thunderclouds-vincent-van-gogh/kAErTfh0dORNwQ\n", - "artwork URL: https://artsandculture.google.com/asset/irises-vincent-van-gogh/ZQH2h7PBY47yXQ\n", - "artwork URL: https://artsandculture.google.com/asset/wheatfield-with-a-reaper-vincent-van-gogh/BgFGcS3ucZqeRA\n", - "artwork URL: https://artsandculture.google.com/asset/sunflowers-vincent-van-gogh/XwHuufJZYFvUnA\n", - "artwork URL: https://artsandculture.google.com/asset/postman-joseph-roulin-vincent-van-gogh/nwEw_d8jN9jVbw\n", - "artwork URL: https://artsandculture.google.com/asset/the-harvest-vincent-van-gogh/UAEejbUbf7fwSg\n", - "artwork URL: https://artsandculture.google.com/asset/first-steps-after-millet-vincent-van-gogh/jAE8KAdj05Buug\n", - "artwork URL: https://artsandculture.google.com/asset/green-field-vincent-van-gogh/EQF2FvGUZzLKOA\n", - "Error at attempt 0: Message: Unable to locate element: /html/body/div[3]/div[3]/div/div/div[2]/div[3]\n", - "Stacktrace:\n", - "RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8\n", - "WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:187:5\n", - "NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:505:5\n", - "element.find/ Date: Wed, 28 Jun 2023 14:29:11 +0200 Subject: [PATCH 04/12] refine error handling and retrying --- ...all_artworks-error-handling-retrying.ipynb | 3046 +++++++++++++++++ 1 file changed, 3046 insertions(+) create mode 100644 examples/example_collect_all_artworks-error-handling-retrying.ipynb diff --git a/examples/example_collect_all_artworks-error-handling-retrying.ipynb b/examples/example_collect_all_artworks-error-handling-retrying.ipynb new file mode 100644 index 0000000..8ac0a43 --- /dev/null +++ b/examples/example_collect_all_artworks-error-handling-retrying.ipynb @@ -0,0 +1,3046 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9b42eac8", + "metadata": {}, + "source": [ + "# Find links to Google Arts & Culture webpages of all artists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "229a67a1", + "metadata": {}, + "outputs": [], + "source": [ + "from artscraper import get_artist_links" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77cd3e7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Get links for all artists, as a list\n", + "#artist_urls = get_artist_links(executable_path='geckodriver', min_wait_time=1, output_file='artist_links.txt')" + ] + }, + { + "cell_type": "markdown", + "id": "d83d0691", + "metadata": {}, + "source": [ + "# Collect artworks and metadata for all artists" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "194ac6af", + "metadata": {}, + "outputs": [], + "source": [ + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8583c7c0", + "metadata": {}, + "outputs": [], + "source": [ + "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c7c7aed1", + "metadata": {}, + "outputs": [], + "source": [ + "min_wait_time = 2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c3a16846", + "metadata": {}, + "outputs": [], + "source": [ + "# Subset of artist links, for illustration purposes\n", + "artist_urls = ['https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2',\n", + " 'https://artsandculture.google.com/entity/claude-monet/m01xnj',\n", + " 'https://artsandculture.google.com/entity/banksy/m023b7b',\n", + " 'https://artsandculture.google.com/entity/rembrandt/m0bskv2',\n", + " 'https://artsandculture.google.com/entity/raphael/m0c43g']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "50989f25", + "metadata": {}, + "outputs": [], + "source": [ + "artist_urls = ['https://artsandculture.google.com/entity/mary-louise-mclaughlin/m02x3qks']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "43ae9afa", + "metadata": {}, + "outputs": [], + "source": [ + "# Directory in which the data is to be stored\n", + "output_dir = './data_retry'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c34d9016", + "metadata": {}, + "outputs": [], + "source": [ + "# Maximum number of attempts to perform a task \n", + "max_retries = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "227a6e45", + "metadata": {}, + "outputs": [ + { + "ename": "JSONDecodeError", + "evalue": "Expecting value: line 1 column 1 (char 0)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 971\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcomplexjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 972\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mJSONDecodeError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.9/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 345\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.9/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 336\u001b[0m \"\"\"\n\u001b[0;32m--> 337\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.9/json/decoder.py\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 355\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Expecting value\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 356\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", + "\u001b[0;32m~/ResearchEngineering/artscraper/artscraper/find_artworks.py\u001b[0m in \u001b[0;36msave_artist_information\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 136\u001b[0m '''\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 138\u001b[0;31m \u001b[0martist_works\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martist_description\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martist_metadata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artist_information\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 139\u001b[0m \u001b[0martist_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_wikipedia_article_title\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/ResearchEngineering/artscraper/artscraper/find_artworks.py\u001b[0m in \u001b[0;36mget_artist_information\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0martist_works\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artist_works\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0martist_description\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artist_description\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 126\u001b[0;31m \u001b[0martist_metadata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artist_metadata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 128\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0martist_works\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martist_description\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martist_metadata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/ResearchEngineering/artscraper/artscraper/find_artworks.py\u001b[0m in \u001b[0;36mget_artist_metadata\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;31m# Convert response to dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;31m# Extract properties searched by the SPARQL query\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 973\u001b[0m \u001b[0;31m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[0;31m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 975\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRequestsJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 976\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 977\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "# Find_artworks for each artist\n", + "for artist_url in artist_urls:\n", + " with FindArtworks(artist_link=artist_url, output_dir=output_dir, min_wait_time=10) as scraper:\n", + " # Save list of works, description, and metadata for an artist\n", + " scraper.save_artist_information()\n", + " # Get list of links to this artist's works \n", + " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", + " artwork_links = [line.rstrip() for line in file]\n", + " # Create directory for this artist\n", + " artist_dir = output_dir + '/' + scraper.get_wikipedia_article_title() \n", + " # Scrape artworks\n", + " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=10) as subscraper:\n", + " # Go through each artwork link\n", + " for url in artwork_links:\n", + " subscraper.load_link(url)\n", + " subscraper.save_metadata()\n", + " subscraper.save_image()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e68f43d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ad0f8897", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artist URL: https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2\n", + "artwork URL: https://artsandculture.google.com/asset/undergrowth-with-two-figures-vincent-van-gogh-dutch-b-1853-d-1890/PgGaehoXTiERQQ\n", + "artwork URL: https://artsandculture.google.com/asset/head-of-a-skeleton-with-a-burning-cigarette-vincent-van-gogh/hQGZa2I9Xi6lpA\n", + "artwork URL: https://artsandculture.google.com/asset/the-starry-night-vincent-van-gogh/bgEuwDxel93-Pg\n", + "artwork URL: https://artsandculture.google.com/asset/self-portrait/9gFw_1Vou2CkwQ\n", + "artwork URL: https://artsandculture.google.com/asset/country-road-in-provence-by-night-vincent-van-gogh/4wEXP9j2v6hpYw\n", + "artwork URL: https://artsandculture.google.com/asset/almond-blossom-vincent-van-gogh/dAFXSL9sZ1ulDw\n", + "artwork URL: https://artsandculture.google.com/asset/wheatfield-with-crows-vincent-van-gogh/dwFdD5AMQfpSew\n", + "artwork URL: https://artsandculture.google.com/asset/roses-vincent-van-gogh/UQGFh2ps12F5hw\n", + "artwork URL: https://artsandculture.google.com/asset/the-yellow-house-the-street-vincent-van-gogh/4gEx_EL470OSUw\n", + "artwork URL: https://artsandculture.google.com/asset/the-potato-eaters-vincent-van-gogh/7gFcKarE9QeaXw\n", + "artwork URL: https://artsandculture.google.com/asset/sunflowers-vincent-van-gogh/hwEGmsM-FoHAwA\n", + "artwork URL: https://artsandculture.google.com/asset/starry-night/uQE3XORhSK37Dw\n", + "artwork URL: https://artsandculture.google.com/asset/the-bedroom-vincent-van-gogh/KwF-AdF1REQl6w\n", + "artwork URL: https://artsandculture.google.com/asset/self-portrait-with-a-straw-hat-obverse-the-potato-peeler-vincent-van-gogh/zQGHorekyP-67w\n", + "artwork URL: https://artsandculture.google.com/asset/cypresses-vincent-van-gogh/zwEHXljwy3BidA\n", + "artwork URL: https://artsandculture.google.com/asset/van-gogh-s-bedroom-in-arles/kQEugEsNjGDZfw\n", + "artwork URL: https://artsandculture.google.com/asset/the-church-in-auvers-sur-oise-view-from-the-chevet/6wEjLceQPXkTtA\n", + "artwork URL: https://artsandculture.google.com/asset/roses-vincent-van-gogh/DgElRwoxZWloQQ\n", + "artwork URL: https://artsandculture.google.com/asset/self-portrait-with-grey-felt-hat-vincent-van-gogh/PgEJ1hPIzqsM2w\n", + "artwork URL: https://artsandculture.google.com/asset/sorrowing-old-man-at-eternity-s-gate-vincent-van-gogh/ywEJUSEHQmoNYw\n", + "artwork URL: https://artsandculture.google.com/asset/the-bedroom-vincent-van-gogh-dutch-1853-1890/rgHdFPzCeCfnxQ\n", + "artwork URL: https://artsandculture.google.com/asset/wheatfield-under-thunderclouds-vincent-van-gogh/kAErTfh0dORNwQ\n", + "artwork URL: https://artsandculture.google.com/asset/irises-vincent-van-gogh/ZQH2h7PBY47yXQ\n", + "artwork URL: https://artsandculture.google.com/asset/wheatfield-with-a-reaper-vincent-van-gogh/BgFGcS3ucZqeRA\n", + "artwork URL: https://artsandculture.google.com/asset/sunflowers-vincent-van-gogh/XwHuufJZYFvUnA\n", + "artwork URL: https://artsandculture.google.com/asset/postman-joseph-roulin-vincent-van-gogh/nwEw_d8jN9jVbw\n", + "artwork URL: https://artsandculture.google.com/asset/the-harvest-vincent-van-gogh/UAEejbUbf7fwSg\n", + "artwork URL: https://artsandculture.google.com/asset/first-steps-after-millet-vincent-van-gogh/jAE8KAdj05Buug\n", + "artwork URL: https://artsandculture.google.com/asset/green-field-vincent-van-gogh/EQF2FvGUZzLKOA\n", + "Error at attempt 0: Message: Unable to locate element: /html/body/div[3]/div[3]/div/div/div[2]/div[3]\n", + "Stacktrace:\n", + "RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8\n", + "WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:187:5\n", + "NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:505:5\n", + "element.find/ Date: Thu, 29 Jun 2023 13:59:29 +0200 Subject: [PATCH 05/12] create clean notebook --- ...all_artworks_error_handling_retrying.ipynb | 227 ++++++++++++++---- 1 file changed, 185 insertions(+), 42 deletions(-) diff --git a/examples/example_collect_all_artworks_error_handling_retrying.ipynb b/examples/example_collect_all_artworks_error_handling_retrying.ipynb index 8569920..23e21a6 100644 --- a/examples/example_collect_all_artworks_error_handling_retrying.ipynb +++ b/examples/example_collect_all_artworks_error_handling_retrying.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "194ac6af", "metadata": {}, "outputs": [], @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "8583c7c0", "metadata": {}, "outputs": [], @@ -30,18 +30,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "c34d9016", "metadata": {}, "outputs": [], "source": [ "# Maximum number of attempts to perform a task \n", - "max_retries = 3" + "max_retries = 10" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "c7c7aed1", "metadata": {}, "outputs": [], @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "8d21abe7", "metadata": {}, "outputs": [], @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "43ae9afa", "metadata": {}, "outputs": [], @@ -73,12 +73,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "eb623d6b", "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork URL: https://artsandculture.google.com/asset/quilt-clementine-hunter/_gFkai2V-4ydag\n", + "artwork URL: https://artsandculture.google.com/asset/zinnias-in-a-pot-clementine-hunter/mwGD6E7Ek5acqQ\n", + "artwork URL: https://artsandculture.google.com/asset/melrose-quilt-clementine-hunter/zAEML8E_JHdZBw\n", + "artwork URL: https://artsandculture.google.com/asset/funeral-procession-clementine-hunter/3gHi9tMtAF2big\n", + "artwork URL: https://artsandculture.google.com/asset/chevron-quilt-clementine-hunter/UQGTB4IChKZ6Qw\n", + "artwork URL: https://artsandculture.google.com/asset/chaleur-the-sun-gives-life-to-everything-clementine-hunter/HwGBfvookbPqkQ\n", + "artwork URL: https://artsandculture.google.com/asset/flowers-clementine-hunter/zQERekxk8d_F8g\n", + "artwork URL: https://artsandculture.google.com/asset/crucifixion-with-angel-clementine-hunter/MwHjeBEoiNhbbQ\n", + "artwork URL: https://artsandculture.google.com/asset/cooking-out-clementine-hunter/rQGtvTjBIYIJ6Q\n", + "artwork URL: https://artsandculture.google.com/asset/panorama-of-baptism-on-cane-river-clementine-hunter/EwGOfLBe5vUg2g\n", + "artwork URL: https://artsandculture.google.com/asset/floral-mosaic-5-clementine-hunter/zQGnBVJlybWfzw\n", + "artwork URL: https://artsandculture.google.com/asset/chickens-clementine-hunter/0wEvGq3AgMynow\n", + "artwork URL: https://artsandculture.google.com/asset/birds-and-flowers-clementine-hunter/OAHgv0AFrobJPQ\n", + "artwork URL: https://artsandculture.google.com/asset/minding-baby-clementine-hunter/FQGa7FzDuGM0cg\n", + "artwork URL: https://artsandculture.google.com/asset/fall-fireworks-clementine-hunter/FwEDCbEb6A9hig\n", + "artwork URL: https://artsandculture.google.com/asset/farmlands-clementine-hunter/UwGchbC0dry5DA\n", + "artwork URL: https://artsandculture.google.com/asset/catus-in-a-red-bowl-clementine-hunter/cAH94kfbGdPPqQ\n", + "artwork URL: https://artsandculture.google.com/asset/flowers-in-a-jar-clementine-hunter/3gHpffPjlfmLQA\n", + "artwork URL: https://artsandculture.google.com/asset/street-of-the-neighborhood-clementine-hunter/zAG27Fcfy4v7AQ\n", + "artwork URL: https://artsandculture.google.com/asset/fish-in-the-ocean-clementine-hunter/_QFdEcDCjuN2Vg\n", + "artwork URL: https://artsandculture.google.com/asset/village-no-1-clementine-hunter/XgEyBhu4t7gXZQ\n", + "artwork URL: https://artsandculture.google.com/asset/fish-bowl-clementine-hunter/WQFNS6_tEf2jjg\n", + "artwork URL: https://artsandculture.google.com/asset/quilt-no-2-clementine-hunter/FAH2dn3Eh8QR_Q\n", + "artwork URL: https://artsandculture.google.com/asset/circus-clementine-hunter/AgHnJrgLFpQuCg\n", + "artwork URL: https://artsandculture.google.com/asset/quilt-clementine-hunter/_gFkai2V-4ydag\n", + "artwork URL: https://artsandculture.google.com/asset/feeding-birds-clementine-hunter/lwHj85ayu4zyBA\n", + "artwork URL: https://artsandculture.google.com/asset/wash-day-clementine-hunter/rgENCOZdm4aAKw\n", + "artwork URL: https://artsandculture.google.com/asset/birds-clementine-hunter/OQGQcPFtMUbT5Q\n", + "CPU times: user 725 ms, sys: 129 ms, total: 854 ms\n", + "Wall time: 35min\n" + ] + } + ], "source": [ "%%time \n", "\n", @@ -101,16 +138,6 @@ " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "aaf2acf7", - "metadata": {}, - "outputs": [], - "source": [ - "len(artwork_links)" - ] - }, { "cell_type": "markdown", "id": "3519fb2e", @@ -129,10 +156,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "9f56346a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A._Y._Jackson Clementine_Hunter Hokusai\r\n" + ] + } + ], "source": [ "!ls data" ] @@ -147,10 +182,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "fc4d3d90", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "description.txt metadata.json\tworks works.txt\r\n" + ] + } + ], "source": [ "!ls ./data/Clementine_Hunter" ] @@ -165,10 +208,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "38079197", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clementine Hunter (pronounced Clementeen; late December 1886 or early January 1887 – January 1, 1988) was a self-taught Black folk artist from the Cane River region of Louisiana, who lived and worked on Melrose Plantation.\r\n", + "Hunter was born into a Louisiana Creole family at Hidden Hill Plantation near Cloutierville, in Natchitoches Parish, Louisiana. She started working as a farm laborer when young, and never learned to read or write. In her fifties, she began to sell her paintings, which soon gained local and national attention for their complexity in depicting Black Southern life in the early 20th century.\r\n", + "Initially she sold her first paintings for as little as 25 cents. But by the end of her life, her work was being exhibited in museums and sold by dealers for thousands of dollars. Clementine Hunter produced an estimated 5,000 to 10,000 paintings in her lifetime. Hunter was granted an honorary Doctor of Fine Arts degree by Northwestern State University of Louisiana in 1986, and she is the first African-American artist to have a solo exhibition at the present-day New Orleans Museum of Art. In 2013, director Robert Wilson presented a new opera about her, entitled Zinnias: the Life of Clementine Hunter, at Montclair State University in New Jersey." + ] + } + ], "source": [ "!cat ./data/Clementine_Hunter/description.txt" ] @@ -183,10 +236,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "2e5ef192", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"family name\": \"Hunter\", \"given name\": [\"Clementine\", \"Clementina\"], \"sex or gender\": \"female\", \"date of birth\": [\"1889-01-01\", \"1886-01-01\"], \"place of birth\": [\"Cloutierville\", \"Louisiana\"], \"latitude of place of birth\": [\"31.5433\", \"31.0\"], \"longitude of place of birth\": [\"-92.9183\", \"-92.0\"], \"date of death\": [\"1889-01-01\", \"1886-01-01\"], \"place of death\": [\"Natchitoches\", \"Natchitoches Parish\"], \"latitude of place of death\": [\"31.7431\", \"31.73\"], \"longitude of place of death\": [\"-93.095\", \"-93.1\"], \"country of citizenship\": \"United States of America\", \"residence\": \"\", \"work location\": \"\", \"genre\": \"portrait\", \"movement\": \"\", \"occupation\": [\"artist\", \"painter\"]}" + ] + } + ], "source": [ "!cat ./data/Clementine_Hunter/metadata.json" ] @@ -201,10 +262,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "1cd0d995", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "birds-and-flowers-clementine-hunter_OAHgv0AFrobJPQ\r\n", + "birds-clementine-hunter_OQGQcPFtMUbT5Q\r\n", + "catus-in-a-red-bowl-clementine-hunter_cAH94kfbGdPPqQ\r\n", + "chaleur-the-sun-gives-life-to-everything-clementine-hunter_HwGBfvookbPqkQ\r\n", + "chevron-quilt-clementine-hunter_UQGTB4IChKZ6Qw\r\n", + "chickens-clementine-hunter_0wEvGq3AgMynow\r\n", + "circus-clementine-hunter_AgHnJrgLFpQuCg\r\n", + "cooking-out-clementine-hunter_rQGtvTjBIYIJ6Q\r\n", + "crucifixion-with-angel-clementine-hunter_MwHjeBEoiNhbbQ\r\n", + "fall-fireworks-clementine-hunter_FwEDCbEb6A9hig\r\n", + "farmlands-clementine-hunter_UwGchbC0dry5DA\r\n", + "feeding-birds-clementine-hunter_lwHj85ayu4zyBA\r\n", + "fish-bowl-clementine-hunter_WQFNS6_tEf2jjg\r\n", + "fish-in-the-ocean-clementine-hunter__QFdEcDCjuN2Vg\r\n", + "floral-mosaic-5-clementine-hunter_zQGnBVJlybWfzw\r\n", + "flowers-clementine-hunter_zQERekxk8d_F8g\r\n", + "flowers-in-a-jar-clementine-hunter_3gHpffPjlfmLQA\r\n", + "funeral-procession-clementine-hunter_3gHi9tMtAF2big\r\n", + "melrose-quilt-clementine-hunter_zAEML8E_JHdZBw\r\n", + "minding-baby-clementine-hunter_FQGa7FzDuGM0cg\r\n", + "panorama-of-baptism-on-cane-river-clementine-hunter_EwGOfLBe5vUg2g\r\n", + "quilt-clementine-hunter__gFkai2V-4ydag\r\n", + "quilt-no-2-clementine-hunter_FAH2dn3Eh8QR_Q\r\n", + "street-of-the-neighborhood-clementine-hunter_zAG27Fcfy4v7AQ\r\n", + "village-no-1-clementine-hunter_XgEyBhu4t7gXZQ\r\n", + "wash-day-clementine-hunter_rgENCOZdm4aAKw\r\n", + "zinnias-in-a-pot-clementine-hunter_mwGD6E7Ek5acqQ\r\n" + ] + } + ], "source": [ "!ls ./data/Clementine_Hunter/works" ] @@ -219,10 +314,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "256919d3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork.png metadata.json\r\n" + ] + } + ], "source": [ "!ls ./data/Clementine_Hunter/works/flowers-clementine-hunter_zQERekxk8d_F8g" ] @@ -237,10 +340,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "b5504ef7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"main_text\": \"Oil on canvas still life painting of a vase of flowers. The brightly colored orange vase has a handle on the right side. It contains green foliage and red, yellow, orange, and white flowers. The background is a dark brownish black.\", \"title\": \"Flowers\", \"creator\": \"Clementine Hunter\", \"date created\": \"1973\", \"id\": \"zQERekxk8d_F8g\", \"link\": \"https://artsandculture.google.com/asset/flowers-clementine-hunter/zQERekxk8d_F8g\"}" + ] + } + ], "source": [ "!cat ./data/Clementine_Hunter/works/flowers-clementine-hunter_zQERekxk8d_F8g/metadata.json" ] @@ -255,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "c783a9ad", "metadata": {}, "outputs": [], @@ -265,10 +376,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "e38b6ff7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork URL: https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg\n" + ] + } + ], "source": [ "with GoogleArtScraper(output_dir + '/' + 'Hokusai' + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", " # Go through each artwork link\n", @@ -287,10 +406,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "5bd7654a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai_hg\r\n" + ] + } + ], "source": [ "!ls ./data/Hokusai/works" ] @@ -305,10 +432,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "7a85d111", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork.png metadata.json\r\n" + ] + } + ], "source": [ "!ls ./data/Hokusai/works/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai_hg" ] @@ -323,10 +458,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "2e5d6e6d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"main_text\": \"\", \"title\": \"\\u51a8\\u5dbd\\u4e09\\u5341\\u516d\\u666f\\u3000\\u7532\\u5dde\\u77f3\\u73ed\\u6ca2|Kajikazawa in Kai Province (K\\u014dsh\\u016b Kajikazawa), from the series Thirty-six Views of Mount Fuji (Fugaku sanj\\u016brokkei)\", \"creator\": \"Katsushika Hokusai\", \"date created\": \"ca. 1830\\u201332\", \"physical dimensions\": \"10 1/4 x 15 1/8 in. (26 x 38.4 cm)\", \"type\": \"Woodblock print\", \"external link\": \" http://www.metmuseum.org/art/collection/search/39800\", \"medium\": \"Polychrome woodblock print; ink and color on paper\", \"repository\": \"Metropolitan Museum of Art, New York, NY\", \"period\": \"Edo period (1615\\u20131868)\", \"culture\": \"Japan\", \"id\": \"hgHQaDeXBcllwg\", \"link\": \"https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg\"}" + ] + } + ], "source": [ "!cat ./data/Hokusai/works/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai_hg/metadata.json" ] From cd78643a7e847415d1f89f25313bc87417fec133 Mon Sep 17 00:00:00 2001 From: modhurita Date: Thu, 29 Jun 2023 14:18:37 +0200 Subject: [PATCH 06/12] remove unnecessary file --- ...all_artworks-error-handling-retrying.ipynb | 3046 ----------------- 1 file changed, 3046 deletions(-) delete mode 100644 examples/example_collect_all_artworks-error-handling-retrying.ipynb diff --git a/examples/example_collect_all_artworks-error-handling-retrying.ipynb b/examples/example_collect_all_artworks-error-handling-retrying.ipynb deleted file mode 100644 index 8ac0a43..0000000 --- a/examples/example_collect_all_artworks-error-handling-retrying.ipynb +++ /dev/null @@ -1,3046 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "9b42eac8", - "metadata": {}, - "source": [ - "# Find links to Google Arts & Culture webpages of all artists" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "229a67a1", - "metadata": {}, - "outputs": [], - "source": [ - "from artscraper import get_artist_links" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77cd3e7d", - "metadata": {}, - "outputs": [], - "source": [ - "# Get links for all artists, as a list\n", - "#artist_urls = get_artist_links(executable_path='geckodriver', min_wait_time=1, output_file='artist_links.txt')" - ] - }, - { - "cell_type": "markdown", - "id": "d83d0691", - "metadata": {}, - "source": [ - "# Collect artworks and metadata for all artists" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "194ac6af", - "metadata": {}, - "outputs": [], - "source": [ - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8583c7c0", - "metadata": {}, - "outputs": [], - "source": [ - "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c7c7aed1", - "metadata": {}, - "outputs": [], - "source": [ - "min_wait_time = 2" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c3a16846", - "metadata": {}, - "outputs": [], - "source": [ - "# Subset of artist links, for illustration purposes\n", - "artist_urls = ['https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2',\n", - " 'https://artsandculture.google.com/entity/claude-monet/m01xnj',\n", - " 'https://artsandculture.google.com/entity/banksy/m023b7b',\n", - " 'https://artsandculture.google.com/entity/rembrandt/m0bskv2',\n", - " 'https://artsandculture.google.com/entity/raphael/m0c43g']" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "50989f25", - "metadata": {}, - "outputs": [], - "source": [ - "artist_urls = ['https://artsandculture.google.com/entity/mary-louise-mclaughlin/m02x3qks']" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "43ae9afa", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory in which the data is to be stored\n", - "output_dir = './data_retry'" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c34d9016", - "metadata": {}, - "outputs": [], - "source": [ - "# Maximum number of attempts to perform a task \n", - "max_retries = 3" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "227a6e45", - "metadata": {}, - "outputs": [ - { - "ename": "JSONDecodeError", - "evalue": "Expecting value: line 1 column 1 (char 0)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 971\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcomplexjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 972\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mJSONDecodeError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 345\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 336\u001b[0m \"\"\"\n\u001b[0;32m--> 337\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/json/decoder.py\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 355\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Expecting value\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 356\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", - "\u001b[0;32m~/ResearchEngineering/artscraper/artscraper/find_artworks.py\u001b[0m in \u001b[0;36msave_artist_information\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 136\u001b[0m '''\n\u001b[1;32m 137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 138\u001b[0;31m \u001b[0martist_works\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martist_description\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martist_metadata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artist_information\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 139\u001b[0m \u001b[0martist_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_wikipedia_article_title\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/ResearchEngineering/artscraper/artscraper/find_artworks.py\u001b[0m in \u001b[0;36mget_artist_information\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0martist_works\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artist_works\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0martist_description\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artist_description\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 126\u001b[0;31m \u001b[0martist_metadata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_artist_metadata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 127\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 128\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0martist_works\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martist_description\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martist_metadata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/ResearchEngineering/artscraper/artscraper/find_artworks.py\u001b[0m in \u001b[0;36mget_artist_metadata\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;31m# Convert response to dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;31m# Extract properties searched by the SPARQL query\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.9/site-packages/requests/models.py\u001b[0m in \u001b[0;36mjson\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 973\u001b[0m \u001b[0;31m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[0;31m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 975\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRequestsJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 976\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 977\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "# Find_artworks for each artist\n", - "for artist_url in artist_urls:\n", - " with FindArtworks(artist_link=artist_url, output_dir=output_dir, min_wait_time=10) as scraper:\n", - " # Save list of works, description, and metadata for an artist\n", - " scraper.save_artist_information()\n", - " # Get list of links to this artist's works \n", - " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", - " artwork_links = [line.rstrip() for line in file]\n", - " # Create directory for this artist\n", - " artist_dir = output_dir + '/' + scraper.get_wikipedia_article_title() \n", - " # Scrape artworks\n", - " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=10) as subscraper:\n", - " # Go through each artwork link\n", - " for url in artwork_links:\n", - " subscraper.load_link(url)\n", - " subscraper.save_metadata()\n", - " subscraper.save_image()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e68f43d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ad0f8897", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artist URL: https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2\n", - "artwork URL: https://artsandculture.google.com/asset/undergrowth-with-two-figures-vincent-van-gogh-dutch-b-1853-d-1890/PgGaehoXTiERQQ\n", - "artwork URL: https://artsandculture.google.com/asset/head-of-a-skeleton-with-a-burning-cigarette-vincent-van-gogh/hQGZa2I9Xi6lpA\n", - "artwork URL: https://artsandculture.google.com/asset/the-starry-night-vincent-van-gogh/bgEuwDxel93-Pg\n", - "artwork URL: https://artsandculture.google.com/asset/self-portrait/9gFw_1Vou2CkwQ\n", - "artwork URL: https://artsandculture.google.com/asset/country-road-in-provence-by-night-vincent-van-gogh/4wEXP9j2v6hpYw\n", - "artwork URL: https://artsandculture.google.com/asset/almond-blossom-vincent-van-gogh/dAFXSL9sZ1ulDw\n", - "artwork URL: https://artsandculture.google.com/asset/wheatfield-with-crows-vincent-van-gogh/dwFdD5AMQfpSew\n", - "artwork URL: https://artsandculture.google.com/asset/roses-vincent-van-gogh/UQGFh2ps12F5hw\n", - "artwork URL: https://artsandculture.google.com/asset/the-yellow-house-the-street-vincent-van-gogh/4gEx_EL470OSUw\n", - "artwork URL: https://artsandculture.google.com/asset/the-potato-eaters-vincent-van-gogh/7gFcKarE9QeaXw\n", - "artwork URL: https://artsandculture.google.com/asset/sunflowers-vincent-van-gogh/hwEGmsM-FoHAwA\n", - "artwork URL: https://artsandculture.google.com/asset/starry-night/uQE3XORhSK37Dw\n", - "artwork URL: https://artsandculture.google.com/asset/the-bedroom-vincent-van-gogh/KwF-AdF1REQl6w\n", - "artwork URL: https://artsandculture.google.com/asset/self-portrait-with-a-straw-hat-obverse-the-potato-peeler-vincent-van-gogh/zQGHorekyP-67w\n", - "artwork URL: https://artsandculture.google.com/asset/cypresses-vincent-van-gogh/zwEHXljwy3BidA\n", - "artwork URL: https://artsandculture.google.com/asset/van-gogh-s-bedroom-in-arles/kQEugEsNjGDZfw\n", - "artwork URL: https://artsandculture.google.com/asset/the-church-in-auvers-sur-oise-view-from-the-chevet/6wEjLceQPXkTtA\n", - "artwork URL: https://artsandculture.google.com/asset/roses-vincent-van-gogh/DgElRwoxZWloQQ\n", - "artwork URL: https://artsandculture.google.com/asset/self-portrait-with-grey-felt-hat-vincent-van-gogh/PgEJ1hPIzqsM2w\n", - "artwork URL: https://artsandculture.google.com/asset/sorrowing-old-man-at-eternity-s-gate-vincent-van-gogh/ywEJUSEHQmoNYw\n", - "artwork URL: https://artsandculture.google.com/asset/the-bedroom-vincent-van-gogh-dutch-1853-1890/rgHdFPzCeCfnxQ\n", - "artwork URL: https://artsandculture.google.com/asset/wheatfield-under-thunderclouds-vincent-van-gogh/kAErTfh0dORNwQ\n", - "artwork URL: https://artsandculture.google.com/asset/irises-vincent-van-gogh/ZQH2h7PBY47yXQ\n", - "artwork URL: https://artsandculture.google.com/asset/wheatfield-with-a-reaper-vincent-van-gogh/BgFGcS3ucZqeRA\n", - "artwork URL: https://artsandculture.google.com/asset/sunflowers-vincent-van-gogh/XwHuufJZYFvUnA\n", - "artwork URL: https://artsandculture.google.com/asset/postman-joseph-roulin-vincent-van-gogh/nwEw_d8jN9jVbw\n", - "artwork URL: https://artsandculture.google.com/asset/the-harvest-vincent-van-gogh/UAEejbUbf7fwSg\n", - "artwork URL: https://artsandculture.google.com/asset/first-steps-after-millet-vincent-van-gogh/jAE8KAdj05Buug\n", - "artwork URL: https://artsandculture.google.com/asset/green-field-vincent-van-gogh/EQF2FvGUZzLKOA\n", - "Error at attempt 0: Message: Unable to locate element: /html/body/div[3]/div[3]/div/div/div[2]/div[3]\n", - "Stacktrace:\n", - "RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8\n", - "WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:187:5\n", - "NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:505:5\n", - "element.find/ Date: Tue, 4 Jul 2023 14:12:00 +0200 Subject: [PATCH 07/12] ensure non-ASCII characters are correctly decoded from URL and text is correctly written to metadata --- artscraper/base.py | 4 ++-- artscraper/googleart.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/artscraper/base.py b/artscraper/base.py index 3dad232..571cba2 100644 --- a/artscraper/base.py +++ b/artscraper/base.py @@ -141,8 +141,8 @@ def save_metadata(self, meta_fp=None): return metadata = self.get_metadata() self.paint_dir.mkdir(exist_ok=True) - with open(meta_fp, "w", encoding="utf-8") as f: # pylint: disable=invalid-name - json.dump(metadata, f) + with open(meta_fp, "w") as f: + json.dump(metadata, f, ensure_ascii=False).encode("utf8") @abstractmethod def save_image(self, img_fp=None, link=None): diff --git a/artscraper/googleart.py b/artscraper/googleart.py index 0364135..c6def72 100755 --- a/artscraper/googleart.py +++ b/artscraper/googleart.py @@ -5,6 +5,7 @@ from pathlib import Path from time import sleep from urllib.parse import urlparse +from urllib.parse import unquote from bs4 import BeautifulSoup from selenium import webdriver @@ -58,8 +59,11 @@ def load_link(self, link): def paint_dir(self): paint_id = "_".join(urlparse(self.link).path.split("/")[-2:]) + # Prevent problems with character encoding/decoding + paint_id = unquote(paint_id) # Prevent problems with too-long file/directory names - paint_id = paint_id[0:255] + if len(paint_id)>=256: + paint_id = paint_id[0:255] return Path(self.output_dir, paint_id) @@ -101,7 +105,7 @@ def get_main_text(self): if elem.get_attribute("id").startswith("metadata-"): return '' inner_HTML = elem.get_attribute("innerHTML") - return BeautifulSoup(inner_HTML, features="html.parser").text + return unquote(BeautifulSoup(inner_HTML, features="html.parser").text) def _get_metadata(self): if self.output_dir is not None and self.meta_fp.is_file(): @@ -118,9 +122,11 @@ def _get_metadata(self): paragraph_HTML = soup.find_all("li") metadata = {} metadata["main_text"] = self.get_main_text() + metadata["main_text"] = unquote(metadata["main_text"]) for par in paragraph_HTML: name = par.find("span", text=True).contents[0].lower()[:-1] metadata[name] = par.text[len(name) + 2:] + metadata[name] = unquote(metadata[name]) metadata["id"] = paint_id return metadata From 09c747145fdf89b33bfc86ed55f29b972b4929a6 Mon Sep 17 00:00:00 2001 From: modhurita Date: Tue, 4 Jul 2023 15:37:44 +0200 Subject: [PATCH 08/12] make sure non-ascii characters are displayed properly --- artscraper/base.py | 2 +- artscraper/find_artworks.py | 18 +- ...le_collect_all_artworks_url_decoding.ipynb | 457 ++++++++++++++++++ 3 files changed, 469 insertions(+), 8 deletions(-) create mode 100644 examples/example_collect_all_artworks_url_decoding.ipynb diff --git a/artscraper/base.py b/artscraper/base.py index 571cba2..8f7b07a 100644 --- a/artscraper/base.py +++ b/artscraper/base.py @@ -142,7 +142,7 @@ def save_metadata(self, meta_fp=None): metadata = self.get_metadata() self.paint_dir.mkdir(exist_ok=True) with open(meta_fp, "w") as f: - json.dump(metadata, f, ensure_ascii=False).encode("utf8") + json.dump(metadata, f, ensure_ascii=False) @abstractmethod def save_image(self, img_fp=None, link=None): diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index 68deaeb..84292ac 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -16,6 +16,7 @@ import re import requests import json +from urllib.parse import unquote from selenium import webdriver @@ -81,7 +82,7 @@ def __init__(self, artist_link, executable_path='geckodriver', BIND(geof:longitude(?coordinatesBirth) AS ?longitudeOfPlaceOfBirth) } OPTIONAL { - wd:person_id wdt:P569 ?dateTimeOfDeath. + wd:person_id wdt:P570 ?dateTimeOfDeath. BIND (xsd:date(?dateTimeOfDeath) AS ?dateOfDeath) } OPTIONAL { wd:person_id wdt:P20 ?placeOfDeath. } @@ -157,9 +158,7 @@ def save_artist_information(self): with open(artist_description_file, 'w', encoding='utf-8') as file: file.write(artist_description) with open(artist_metadata_file, 'w', encoding='utf-8') as file: - #for key,value in artist_metadata.items(): - #file.write(f'{key} : {value}\n') - json.dump(artist_metadata, file) + json.dump(artist_metadata, file, ensure_ascii=False) def get_artist_works(self): @@ -221,8 +220,10 @@ def get_artist_description(self): # Get the Wikipedia page page = wiki.page(title) # Get summary of the page (lead section of the Wikipedia article) - description = page.summary + description = unquote(page.summary) + description = unquote(description) + return description def get_artist_metadata(self): @@ -259,7 +260,7 @@ def get_artist_metadata(self): # Assemble metadata in a dictionary metadata = {re.sub(r'(\B[A-Z])', r' \1', property).lower(): \ self._get_property(data, property) for property in properties} - + return metadata @@ -297,8 +298,10 @@ def get_wikipedia_article_title(self): # Get title of artist's Wikipedia article title = wikipedia_link.rsplit('/')[-1] + title = unquote(title) + return title - + def get_artist_wikidata_id(self): ''' @@ -345,6 +348,7 @@ def _get_property(self, data, query_property): if query_property+'Label' in data['results']['bindings'][0].keys(): for element in data['results']['bindings']: output_property = element[query_property+'Label']['value'] + output_property = unquote(output_property) # Avoid duplicates if output_property not in output_property_list: output_property_list.append(output_property) diff --git a/examples/example_collect_all_artworks_url_decoding.ipynb b/examples/example_collect_all_artworks_url_decoding.ipynb new file mode 100644 index 0000000..1be6366 --- /dev/null +++ b/examples/example_collect_all_artworks_url_decoding.ipynb @@ -0,0 +1,457 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d83d0691", + "metadata": {}, + "source": [ + "# Example: Artist Zdeněk Sýkora " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "194ac6af", + "metadata": {}, + "outputs": [], + "source": [ + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8583c7c0", + "metadata": {}, + "outputs": [], + "source": [ + "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c34d9016", + "metadata": {}, + "outputs": [], + "source": [ + "# Maximum number of attempts to perform a task \n", + "max_retries = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c7c7aed1", + "metadata": {}, + "outputs": [], + "source": [ + "min_wait_time = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "43ae9afa", + "metadata": {}, + "outputs": [], + "source": [ + "# Directory in which the data is to be stored\n", + "output_dir = './data'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8d21abe7", + "metadata": {}, + "outputs": [], + "source": [ + "# Artist Zdeněk Sýkora, 3 artworks\n", + "artist_urls = ['https://artsandculture.google.com/entity/zden%C4%9Bk-s%C3%BDkora/m0gyrctv']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "eb623d6b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork URL: https://artsandculture.google.com/asset/line-no-56-humberto-zden%C4%9Bk-s%C3%BDkora/XgGPSy53OVWyaw\n", + "artwork URL: https://artsandculture.google.com/asset/black-lines-zden%C4%9Bk-s%C3%BDkora/NwEzX_kYKTNDOA\n", + "artwork URL: https://artsandculture.google.com/asset/black-and-white-structure-zdenek-sykora/6gGt7GEf2h9Yuw\n", + "artwork URL: https://artsandculture.google.com/asset/line-no-56-humberto-zden%C4%9Bk-s%C3%BDkora/XgGPSy53OVWyaw\n", + "CPU times: user 54.9 ms, sys: 23.2 ms, total: 78.1 ms\n", + "Wall time: 9.75 s\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "# Find_artworks for each artist\n", + "for artist_url in artist_urls:\n", + " with FindArtworks(artist_link=artist_url, output_dir=output_dir, \n", + " min_wait_time=min_wait_time) as scraper:\n", + " # Save list of works, description, and metadata for an artist\n", + " retry(scraper.save_artist_information, max_retries, min_wait_time)\n", + " # Create directory for this artist\n", + " artist_dir = output_dir + '/' + scraper.get_wikipedia_article_title() \n", + " # Get list of links to this artist's works \n", + " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", + " artwork_links = [line.rstrip() for line in file] \n", + " # Scrape artworks\n", + " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", + " # Go through each artwork link\n", + " for url in artwork_links:\n", + " print(f'artwork URL: {url}')\n", + " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" + ] + }, + { + "cell_type": "markdown", + "id": "3519fb2e", + "metadata": {}, + "source": [ + "## Display results" + ] + }, + { + "cell_type": "markdown", + "id": "f8cfeece", + "metadata": {}, + "source": [ + "### Display contents of data directory" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9f56346a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A._Y._Jackson Clementine_Hunter Hokusai Hokusai_old\tZdeněk_Sýkora\r\n" + ] + } + ], + "source": [ + "!ls data" + ] + }, + { + "cell_type": "markdown", + "id": "eff822f0", + "metadata": {}, + "source": [ + "### Display contents of directory for one artist" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fc4d3d90", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "description.txt metadata.json\tworks works.txt\r\n" + ] + } + ], + "source": [ + "!ls ./data/Zdeněk_Sýkora" + ] + }, + { + "cell_type": "markdown", + "id": "e0921cb7", + "metadata": {}, + "source": [ + "### Description of artist" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "38079197", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Zdeněk Sýkora (February 3, 1920 – July 12, 2011) was a Czechoslovakian modern abstract painter and sculptor, and a pioneer of using computers in art." + ] + } + ], + "source": [ + "!cat ./data/Zdeněk_Sýkora/description.txt" + ] + }, + { + "cell_type": "markdown", + "id": "322e9c5b", + "metadata": {}, + "source": [ + "### Metadata of artist" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e5ef192", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"family name\": \"Sýkora\", \"given name\": \"Zdeněk\", \"sex or gender\": \"male\", \"date of birth\": \"1920-02-03\", \"place of birth\": [\"Paceřice\", \"Louny\"], \"latitude of place of birth\": [\"50.619292049\", \"50.357078457\"], \"longitude of place of birth\": [\"15.113627963\", \"13.796762432\"], \"date of death\": \"2011-07-12\", \"place of death\": \"Louny\", \"latitude of place of death\": \"50.357078457\", \"longitude of place of death\": \"13.796762432\", \"country of citizenship\": [\"Czech Republic\", \"Czechoslovakia\"], \"residence\": \"\", \"work location\": \"Louny\", \"genre\": \"\", \"movement\": \"\", \"occupation\": [\"teacher\", \"architect\", \"painter\", \"sculptor\", \"graphic artist\"]}" + ] + } + ], + "source": [ + "!cat ./data/Zdeněk_Sýkora/metadata.json" + ] + }, + { + "cell_type": "markdown", + "id": "63251f32", + "metadata": {}, + "source": [ + "### Directory containing works of this artist" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1cd0d995", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "black-and-white-structure-zdenek-sykora_6gGt7GEf2h9Yuw\r\n", + "black-lines-zdeněk-sýkora_NwEzX_kYKTNDOA\r\n", + "line-no-56-humberto-zdeněk-sýkora_XgGPSy53OVWyaw\r\n" + ] + } + ], + "source": [ + "!ls ./data/Zdeněk_Sýkora/works" + ] + }, + { + "cell_type": "markdown", + "id": "4c20d8c2", + "metadata": {}, + "source": [ + "### Directory containing one artwork by this artist" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "256919d3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork.png metadata.json\r\n" + ] + } + ], + "source": [ + "!ls ./data/Zdeněk_Sýkora/works/line-no-56-humberto-zdeněk-sýkora_XgGPSy53OVWyaw" + ] + }, + { + "cell_type": "markdown", + "id": "6829e0a2", + "metadata": {}, + "source": [ + "### Display metadata for this artwork" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b5504ef7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"main_text\": \"The Fifty-sixth Line (in this simple way the artist has been naming his paintings since 1974) takes an important place in Sýkora´s work baceuse of its vigorous colour scheme. The first pictures on this theme were painted in 1973, when the preceding and no less exciting stage of his work was closed - the ten-year-long period of structures. As early as 1964, Zdeněk Sýkora, probably the first painter in the history of art, made use of the computer as an auxiliary tool in the solution of the composition of the picture. The programme for determining the position of each element in the structure was devised jointly with the mathematician Jaroslav Blažek. The structures were subject to strict, predetermined rules, the computer kept the rules and solved the assignment. On the other hand the \\\"earthworms\\\" - as the artist sometimes called his line pictures - are based on the application of chance in the system evolved by the artist. In 1985 his wife Lenka began to cooperate on its development. Thus each picture at firts existed as a numerical score, which was then transferred in classical way, by paint, on the canvas, as if a composition was created from the music notes. And why did the painter give the picture another name? He says that he himself was suprised by the way in which the \\\"crazy\\\" picure reminiscent of a circus had originated. The painting thus went beyond his expectations in all respects, he even had to get accustomed to the strange combinations of colours so that for one year Humberto was stored in the next room. In the end, it became a favourite representative of Sýkora´s art.\", \"title\": \"Line no. 56 (Humberto)\", \"creator\": \"Zdeněk Sýkora\", \"date\": \"1988\", \"physical dimensions\": \"200 x 200 cm\", \"medium\": \"acrylic, canvas\", \"id\": \"XgGPSy53OVWyaw\", \"link\": \"https://artsandculture.google.com/asset/line-no-56-humberto-zden%C4%9Bk-s%C3%BDkora/XgGPSy53OVWyaw\"}" + ] + } + ], + "source": [ + "!cat ./data/Zdeněk_Sýkora/works/line-no-56-humberto-zdeněk-sýkora_XgGPSy53OVWyaw/metadata.json" + ] + }, + { + "cell_type": "markdown", + "id": "2810ec59", + "metadata": {}, + "source": [ + "# Example artwork: Kajikazawa in Kai Province, from the series thirty-six views of Mount Fuji " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c783a9ad", + "metadata": {}, + "outputs": [], + "source": [ + "artwork_links = ['https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e38b6ff7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork URL: https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg\n" + ] + } + ], + "source": [ + "with GoogleArtScraper(output_dir + '/' + 'Hokusai' + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", + " # Go through each artwork link\n", + " for url in artwork_links:\n", + " print(f'artwork URL: {url}')\n", + " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" + ] + }, + { + "cell_type": "markdown", + "id": "95797323", + "metadata": {}, + "source": [ + "## Display results" + ] + }, + { + "cell_type": "markdown", + "id": "77f7868f", + "metadata": {}, + "source": [ + "### Directory containing works of this artist" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "5bd7654a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "冨嶽三十六景 甲州石班沢-kajikazawa-in-kai-province-kōshū-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanjūrokkei-katsushika-hokusai_hgHQaDeXBcllwg\r\n" + ] + } + ], + "source": [ + "!ls ./data/Hokusai/works" + ] + }, + { + "cell_type": "markdown", + "id": "2a521a4c", + "metadata": {}, + "source": [ + "### Directory containing one artwork by this artist" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7a85d111", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork.png metadata.json\r\n" + ] + } + ], + "source": [ + "!ls ./data/Hokusai/works/冨嶽三十六景 甲州石班沢-kajikazawa-in-kai-province-kōshū-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanjūrokkei-katsushika-hokusai_hgHQaDeXBcllwg" + ] + }, + { + "cell_type": "markdown", + "id": "76b31701", + "metadata": {}, + "source": [ + "### Display metadata for this artwork" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2e5d6e6d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"main_text\": \"\", \"title\": \"冨嶽三十六景 甲州石班沢|Kajikazawa in Kai Province (Kōshū Kajikazawa), from the series Thirty-six Views of Mount Fuji (Fugaku sanjūrokkei)\", \"creator\": \"Katsushika Hokusai\", \"date created\": \"ca. 1830–32\", \"physical dimensions\": \"10 1/4 x 15 1/8 in. (26 x 38.4 cm)\", \"type\": \"Woodblock print\", \"external link\": \" http://www.metmuseum.org/art/collection/search/39800\", \"medium\": \"Polychrome woodblock print; ink and color on paper\", \"repository\": \"Metropolitan Museum of Art, New York, NY\", \"period\": \"Edo period (1615–1868)\", \"culture\": \"Japan\", \"id\": \"hgHQaDeXBcllwg\", \"link\": \"https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg\"}" + ] + } + ], + "source": [ + "!cat ./data/Hokusai/works/冨嶽三十六景 甲州石班沢-kajikazawa-in-kai-province-kōshū-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanjūrokkei-katsushika-hokusai_hgHQaDeXBcllwg/metadata.json" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 17db97145501aa6f4558fb6abdd16af15f9e0ae5 Mon Sep 17 00:00:00 2001 From: modhurita Date: Tue, 4 Jul 2023 15:56:10 +0200 Subject: [PATCH 09/12] fix markdown header --- examples/example_collect_all_artworks_url_decoding.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/example_collect_all_artworks_url_decoding.ipynb b/examples/example_collect_all_artworks_url_decoding.ipynb index 1be6366..c170a01 100644 --- a/examples/example_collect_all_artworks_url_decoding.ipynb +++ b/examples/example_collect_all_artworks_url_decoding.ipynb @@ -5,7 +5,7 @@ "id": "d83d0691", "metadata": {}, "source": [ - "# Example: Artist Zdeněk Sýkora " + "# Example artist: Zdeněk Sýkora " ] }, { @@ -348,7 +348,7 @@ }, { "cell_type": "markdown", - "id": "95797323", + "id": "aafd91c3", "metadata": {}, "source": [ "## Display results" From 9fd68f120fa32cab6765faf994372ff74b22a490 Mon Sep 17 00:00:00 2001 From: modhurita Date: Tue, 4 Jul 2023 17:35:37 +0200 Subject: [PATCH 10/12] fix pylint errors --- artscraper/base.py | 2 +- artscraper/find_artworks.py | 17 +++++----- artscraper/functions.py | 34 ++++++++++++------- artscraper/googleart.py | 15 ++++++-- ...le_collect_all_artworks_url_decoding.ipynb | 2 +- 5 files changed, 44 insertions(+), 26 deletions(-) diff --git a/artscraper/base.py b/artscraper/base.py index 8f7b07a..b3224a3 100644 --- a/artscraper/base.py +++ b/artscraper/base.py @@ -141,7 +141,7 @@ def save_metadata(self, meta_fp=None): return metadata = self.get_metadata() self.paint_dir.mkdir(exist_ok=True) - with open(meta_fp, "w") as f: + with open(meta_fp, "w", encoding='utf-8') as f: json.dump(metadata, f, ensure_ascii=False) @abstractmethod diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index 84292ac..407c05a 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -14,9 +14,9 @@ import time import re -import requests -import json from urllib.parse import unquote +import json +import requests from selenium import webdriver @@ -96,7 +96,7 @@ def __init__(self, artist_link, executable_path='geckodriver', OPTIONAL { wd:person_id wdt:P937 ?workLocation. } OPTIONAL { wd:person_id wdt:P136 ?genre. } OPTIONAL { wd:person_id wdt:P135 ?movement. } - OPTIONAL { wd:person_id wdt:P106 ?occupation. } + OPTIONAL { wd:person_id wdt:P106 ?occupation. } SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } } ''' @@ -223,7 +223,7 @@ def get_artist_description(self): description = unquote(page.summary) description = unquote(description) - + return description def get_artist_metadata(self): @@ -245,7 +245,8 @@ def get_artist_metadata(self): query = self.sparql_query.replace('person_id', artist_id) # Send query request - request = requests.get(url, params= {'format': 'json', 'query': ''.join(query)}, timeout=120) + request = requests.get(url, params={'format': 'json', \ + 'query': ''.join(query)}, timeout=120) # Convert response to dictionary data = request.json() @@ -260,7 +261,7 @@ def get_artist_metadata(self): # Assemble metadata in a dictionary metadata = {re.sub(r'(\B[A-Z])', r' \1', property).lower(): \ self._get_property(data, property) for property in properties} - + return metadata @@ -299,9 +300,9 @@ def get_wikipedia_article_title(self): title = wikipedia_link.rsplit('/')[-1] title = unquote(title) - + return title - + def get_artist_wikidata_id(self): ''' diff --git a/artscraper/functions.py b/artscraper/functions.py index 84d0778..f463a01 100644 --- a/artscraper/functions.py +++ b/artscraper/functions.py @@ -1,10 +1,14 @@ -import time -from random import random +""" + +Functions used repeatedly, and in many places: + +random_wait_time +retry -''' -random_wait_time: Function to determine a random wait time -between two events -''' +""" + +import time +from random import random def random_wait_time(min_wait=5, max_wait=None): """Compute a random wait time. @@ -27,6 +31,7 @@ def random_wait_time(min_wait=5, max_wait=None): Waiting time between `min_wait` and `max_wait` according to the polynomial PDF. """ + # pylint: disable=invalid-name if max_wait is None: max_wait = 3 * min_wait @@ -45,7 +50,7 @@ def inv_cdf(x): return inv_cdf(random()) -def retry(function, max_retries=10, min_wait_time=10, *args): +def retry(function, max_retries, min_wait_time, *args): ''' Parameters ---------- @@ -57,16 +62,19 @@ def retry(function, max_retries=10, min_wait_time=10, *args): ------- Value returned by function, or prints an error message ''' - + + # Want to catch all kinds of exceptions + # pylint: disable=broad-except + num_attempt = 0 while num_attempt < max_retries: - + try: return function(*args) - except Exception as e: - #print(f'Function {function} failed at attempt {num_attempt} with exception {repr(e)}: {str(e)}') - print(f'Function {function} failed at attempt {num_attempt} with exception {repr(e)}') + except Exception as error: + print(f'Function {function} failed at attempt {num_attempt} \ + with exception {repr(error)}') time.sleep(random_wait_time(min_wait=min_wait_time)) num_attempt = num_attempt + 1 - + return None diff --git a/artscraper/googleart.py b/artscraper/googleart.py index c6def72..78e6951 100755 --- a/artscraper/googleart.py +++ b/artscraper/googleart.py @@ -44,7 +44,7 @@ def load_link(self, link): return False self.link = link - if self.output_dir is not None: + if self.output_dir is not None: if (self.paint_dir.is_dir() and self.skip_existing and Path(self.paint_dir, "metadata.json").is_file() and Path(self.paint_dir, "artwork.png").is_file()): @@ -58,13 +58,13 @@ def load_link(self, link): @property def paint_dir(self): paint_id = "_".join(urlparse(self.link).path.split("/")[-2:]) - + # Prevent problems with character encoding/decoding paint_id = unquote(paint_id) # Prevent problems with too-long file/directory names if len(paint_id)>=256: paint_id = paint_id[0:255] - + return Path(self.output_dir, paint_id) def wait(self, min_wait, max_wait=None, update=True): @@ -168,6 +168,15 @@ def save_image(self, img_fp=None, link=None): f.write(self.get_image()) def save_artwork_information(self, link): + """ + Given an artwork link, saves the image and the associated metadata. + + Parameters + ---------- + link: str + Artwork URL. + + """ self.load_link(link) self.save_metadata() self.save_image() diff --git a/examples/example_collect_all_artworks_url_decoding.ipynb b/examples/example_collect_all_artworks_url_decoding.ipynb index c170a01..930ecbb 100644 --- a/examples/example_collect_all_artworks_url_decoding.ipynb +++ b/examples/example_collect_all_artworks_url_decoding.ipynb @@ -348,7 +348,7 @@ }, { "cell_type": "markdown", - "id": "aafd91c3", + "id": "8fb60484", "metadata": {}, "source": [ "## Display results" From 8ca7bf02aaf3660a89e360d36128014502b7ec1b Mon Sep 17 00:00:00 2001 From: modhurita Date: Thu, 6 Jul 2023 15:12:30 +0200 Subject: [PATCH 11/12] fix errors related to selenium update --- artscraper/find_artists.py | 7 +++++-- artscraper/find_artworks.py | 8 ++++---- artscraper/googleart.py | 8 +++++--- .../example_collect_all_artworks_url_decoding.ipynb | 12 ++++++++++-- setup.py | 3 ++- 5 files changed, 26 insertions(+), 12 deletions(-) diff --git a/artscraper/find_artists.py b/artscraper/find_artists.py index 90a4933..cbcb268 100644 --- a/artscraper/find_artists.py +++ b/artscraper/find_artists.py @@ -3,11 +3,14 @@ ''' import time + from selenium import webdriver +from selenium.webdriver.firefox.service import Service as FirefoxService +from webdriver_manager.firefox import GeckoDriverManager + from artscraper.functions import random_wait_time def get_artist_links(webpage='https://artsandculture.google.com/category/artist', - executable_path='geckodriver', min_wait_time=5, output_file=None): ''' Parameters @@ -22,7 +25,7 @@ def get_artist_links(webpage='https://artsandculture.google.com/category/artist' ''' # Launch Firefox browser - driver = webdriver.Firefox(executable_path=executable_path) + driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install())) # Get Google Arts & Culture webpage listing all artists driver.get(webpage) diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index 407c05a..dbdc06d 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -19,6 +19,8 @@ import requests from selenium import webdriver +from selenium.webdriver.firefox.service import Service as FirefoxService +from webdriver_manager.firefox import GeckoDriverManager import wikipediaapi @@ -30,13 +32,11 @@ class FindArtworks: given the link to their Google Arts & Culture webpage ''' - def __init__(self, artist_link, executable_path='geckodriver', + def __init__(self, artist_link, output_dir='./data', sparql_query= None, min_wait_time=5): # Link to artist's Google Arts & Culture webpage self.artist_link = artist_link - # Path to geckodriver - self.executable_path = executable_path # Directory to which the data is to be written # Create it if it doesn't exist Path(output_dir).mkdir(parents=True, exist_ok=True) @@ -104,7 +104,7 @@ def __init__(self, artist_link, executable_path='geckodriver', self.sparql_query = sparql_query # Open web browser - self.driver = webdriver.Firefox(executable_path=self.executable_path) + self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install())) def __enter__(self): diff --git a/artscraper/googleart.py b/artscraper/googleart.py index 78e6951..ce87f49 100755 --- a/artscraper/googleart.py +++ b/artscraper/googleart.py @@ -11,6 +11,8 @@ from selenium import webdriver from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.common.keys import Keys +from selenium.webdriver.firefox.service import Service as FirefoxService +from webdriver_manager.firefox import GeckoDriverManager from artscraper.base import BaseArtScraper from artscraper.functions import random_wait_time @@ -30,10 +32,10 @@ class GoogleArtScraper(BaseArtScraper): is randomly drawn from a polynomial distribution. """ - def __init__(self, output_dir=None, skip_existing=True, min_wait=5, - geckodriver_path="geckodriver"): + def __init__(self, output_dir=None, skip_existing=True, min_wait=5): super().__init__(output_dir, skip_existing, min_wait=min_wait) - self.driver = webdriver.Firefox(executable_path=geckodriver_path) + + self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install())) self.last_request = time.time() - 100 def __exit__(self, _exc_type, _exc_val, _exc_tb): diff --git a/examples/example_collect_all_artworks_url_decoding.ipynb b/examples/example_collect_all_artworks_url_decoding.ipynb index 930ecbb..cd7a5e8 100644 --- a/examples/example_collect_all_artworks_url_decoding.ipynb +++ b/examples/example_collect_all_artworks_url_decoding.ipynb @@ -79,6 +79,14 @@ "scrolled": true }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WDM] - Downloading: 19.2kB [00:00, 9.56MB/s] \n", + "[WDM] - Downloading: 100%|█████████████████| 2.93M/2.93M [00:00<00:00, 75.4MB/s]\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -87,8 +95,8 @@ "artwork URL: https://artsandculture.google.com/asset/black-lines-zden%C4%9Bk-s%C3%BDkora/NwEzX_kYKTNDOA\n", "artwork URL: https://artsandculture.google.com/asset/black-and-white-structure-zdenek-sykora/6gGt7GEf2h9Yuw\n", "artwork URL: https://artsandculture.google.com/asset/line-no-56-humberto-zden%C4%9Bk-s%C3%BDkora/XgGPSy53OVWyaw\n", - "CPU times: user 54.9 ms, sys: 23.2 ms, total: 78.1 ms\n", - "Wall time: 9.75 s\n" + "CPU times: user 267 ms, sys: 35 ms, total: 302 ms\n", + "Wall time: 11 s\n" ] } ], diff --git a/setup.py b/setup.py index 39317f2..be1eaad 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ "requests", "selenium", "beautifulsoup4", - "wikipedia-api" + "wikipedia-api", + "webdriver-manager", ] ) From 62579f2156d185c094bad1dce2b9b86be6d9a84f Mon Sep 17 00:00:00 2001 From: modhurita Date: Fri, 7 Jul 2023 09:46:11 +0200 Subject: [PATCH 12/12] fix wikidata timeout problems with sparql query --- artscraper/base.py | 1 + artscraper/find_artworks.py | 27 ++++++++++++++------------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/artscraper/base.py b/artscraper/base.py index b3224a3..adb3712 100644 --- a/artscraper/base.py +++ b/artscraper/base.py @@ -142,6 +142,7 @@ def save_metadata(self, meta_fp=None): metadata = self.get_metadata() self.paint_dir.mkdir(exist_ok=True) with open(meta_fp, "w", encoding='utf-8') as f: + # pylint: disable=invalid-name json.dump(metadata, f, ensure_ascii=False) @abstractmethod diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index dbdc06d..83dea93 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -52,6 +52,7 @@ def __init__(self, artist_link, SELECT ?familyName ?familyNameLabel ?givenName ?givenNameLabel + ?pseudonym ?pseudonymLabel ?sexOrGender ?sexOrGenderLabel ?dateOfBirth ?dateOfBirthLabel ?placeOfBirth ?placeOfBirthLabel @@ -70,26 +71,27 @@ def __init__(self, artist_link, WHERE { OPTIONAL { wd:person_id wdt:P734 ?familyName. } OPTIONAL { wd:person_id wdt:P735 ?givenName. } + OPTIONAL { wd:person_id wdt:P742 ?pseudonym. } OPTIONAL { wd:person_id wdt:P21 ?sexOrGender. } OPTIONAL { wd:person_id wdt:P569 ?dateTimeOfBirth. BIND (xsd:date(?dateTimeOfBirth) AS ?dateOfBirth) } - OPTIONAL { wd:person_id wdt:P19 ?placeOfBirth. } - OPTIONAL { - ?placeOfBirth wdt:P625 ?coordinatesBirth. - BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth) - BIND(geof:longitude(?coordinatesBirth) AS ?longitudeOfPlaceOfBirth) + OPTIONAL { + wd:person_id wdt:P19 ?placeOfBirth. + ?placeOfBirth wdt:P625 ?coordinatesBirth. + BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth) + BIND(geof:longitude(?coordinatesBirth) AS ?longitudeOfPlaceOfBirth) } OPTIONAL { wd:person_id wdt:P570 ?dateTimeOfDeath. BIND (xsd:date(?dateTimeOfDeath) AS ?dateOfDeath) } - OPTIONAL { wd:person_id wdt:P20 ?placeOfDeath. } - OPTIONAL { - ?placeOfDeath wdt:P625 ?coordinatesDeath. - BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath) - BIND(geof:longitude(?coordinatesDeath) AS ?longitudeOfPlaceOfDeath) + OPTIONAL { + wd:person_id wdt:P20 ?placeOfDeath. + ?placeOfDeath wdt:P625 ?coordinatesDeath. + BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath) + BIND(geof:longitude(?coordinatesDeath) AS ?longitudeOfPlaceOfDeath) } OPTIONAL { wd:person_id wdt:P27 ?countryOfCitizenship. } OPTIONAL { wd:person_id wdt:P551 ?residence. } @@ -185,14 +187,13 @@ def get_artist_works(self): # Check if right arrow button can still be clicked while right_arrow_element.get_attribute('tabindex') is not None: + # Wait for page to load time.sleep(random_wait_time(min_wait=self.min_wait_time)) # Find right arrow button right_arrow_element = parent_element.find_element('xpath', \ './/*[contains(@data-gaaction,"rightArrow")]') # Click on right arrow button self.driver.execute_script("arguments[0].click();", right_arrow_element) - # Wait for page to load - #time.sleep(random_wait_time(min_wait=self.min_wait_time)) # List of all elements with links to artworks elements = right_arrow_element.find_elements('xpath', \ @@ -220,7 +221,7 @@ def get_artist_description(self): # Get the Wikipedia page page = wiki.page(title) # Get summary of the page (lead section of the Wikipedia article) - description = unquote(page.summary) + description = page.summary description = unquote(description)