diff --git a/artscraper/find_artworks.py b/artscraper/find_artworks.py index f4b38ed..63e8ffa 100644 --- a/artscraper/find_artworks.py +++ b/artscraper/find_artworks.py @@ -7,13 +7,11 @@ ''' -# Allow __init__ function to have more than 5 arguments -#pylint: disable-msg=too-many-arguments - from pathlib import Path import time import re +from urllib.parse import urlparse from urllib.parse import unquote import json import requests @@ -30,7 +28,10 @@ class FindArtworks: ''' Class for finding artworks and metadata for an artist, given the link to their Google Arts & Culture webpage - ''' + ''' + + # Allow __init__ function to have more than 5 arguments + # pylint: disable-msg=too-many-arguments def __init__(self, artist_link, output_dir='./data', sparql_query= None, min_wait_time=5): @@ -77,7 +78,7 @@ def __init__(self, artist_link, wd:person_id wdt:P569 ?dateTimeOfBirth. BIND (xsd:date(?dateTimeOfBirth) AS ?dateOfBirth) } - OPTIONAL { + OPTIONAL { wd:person_id wdt:P19 ?placeOfBirth. ?placeOfBirth wdt:P625 ?coordinatesBirth. BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth) @@ -87,7 +88,7 @@ def __init__(self, artist_link, wd:person_id wdt:P570 ?dateTimeOfDeath. BIND (xsd:date(?dateTimeOfDeath) AS ?dateOfDeath) } - OPTIONAL { + OPTIONAL { wd:person_id wdt:P20 ?placeOfDeath. ?placeOfDeath wdt:P625 ?coordinatesDeath. 
BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath) @@ -142,26 +143,53 @@ def save_artist_information(self): ''' artist_works, artist_description, artist_metadata = self.get_artist_information() - artist_name = self.get_wikipedia_article_title() - + artist_name = self.get_artist_name() # Create directory for artist pathname_directory = self.output_dir + '/' + artist_name Path(pathname_directory).mkdir(parents=True, exist_ok=True) - # Filenames for artist's works, description, metadata artist_works_file = pathname_directory + '/' + 'works.txt' artist_description_file = pathname_directory + '/' + 'description.txt' artist_metadata_file = pathname_directory + '/' + 'metadata.json' - # Save artist's works, description, metadata with open(artist_works_file, 'w', encoding='utf-8') as file: for link in artist_works: file.write(f'{link}\n') with open(artist_description_file, 'w', encoding='utf-8') as file: - file.write(artist_description) + if artist_description is not None: + file.write(artist_description) with open(artist_metadata_file, 'w', encoding='utf-8') as file: - json.dump(artist_metadata, file, ensure_ascii=False) + if artist_metadata is not None: + json.dump(artist_metadata, file, ensure_ascii=False) + + + def get_artist_name(self): + + ''' + Return artist's name, with parts thereof being separated by underscores + ''' + + # Artist link + artist_link = self.artist_link + + # Split the link by forward slashes + parts = artist_link.split('/') + # Extract the artist's name (separated by dashes) + artist_name_string = parts[4] + artist_name_string = unquote(artist_name_string) + # Split the artist's name into component parts + artist_name_parts = artist_name_string.split('-') + # Capitalize each component + artist_name_capitalized_parts = [] + for part in artist_name_parts: + part = part.capitalize() + artist_name_capitalized_parts.append(part) + + # Artist's name, separated by underscores + artist_name = ('_').join(artist_name_capitalized_parts) + + 
return artist_name def get_artist_works(self): @@ -216,8 +244,20 @@ def get_artist_description(self): # Get title of artist's Wikipedia article title = self.get_wikipedia_article_title() - # Choose the English Wikipedia - wiki = wikipediaapi.Wikipedia('en') + + # Return None if no Wikipedia article exists + if title is None: + return None + + # Get link to Wikipedia article + wikipedia_article_link = self.get_wikipedia_article_link() + # Parse the URL + parsed_url = urlparse(wikipedia_article_link) + # Find the language of the Wikipedia article + language_code = parsed_url.netloc.split('.')[0] + + # Choose the Wikipedia corresponding to the language code + wiki = wikipediaapi.Wikipedia(language_code) # Get the Wikipedia page page = wiki.page(title) # Get summary of the page (lead section of the Wikipedia article) @@ -238,6 +278,8 @@ def get_artist_metadata(self): # Get Wikidata ID of artist artist_id = self.get_artist_wikidata_id() + if artist_id is None: + return None # Wikidata database to query url = 'https://query.wikidata.org/sparql' @@ -277,9 +319,14 @@ def get_wikipedia_article_link(self): # Get Google Arts & Culture webpage for the artist self.driver.get(self.artist_link) + # Allow bare-except + # pylint: disable=W0702 + try: + # Locate the element containing the link to the artist's Wikipedia article + element = self.driver.find_element('xpath','//*[contains(@href,"wikipedia")]') + except: + return None - # Locate the element containing the link to the artist's Wikipedia article - element = self.driver.find_element('xpath','//*[contains(@href,"wikipedia")]') # Extract the link to the Wikipedia article wikipedia_link = element.get_attribute('href') @@ -297,12 +344,13 @@ def get_wikipedia_article_title(self): # Get the link to the artist's Wikipedia article wikipedia_link = self.get_wikipedia_article_link() - # Get title of artist's Wikipedia article - title = wikipedia_link.rsplit('/')[-1] - - title = unquote(title) + if wikipedia_link is not None: + # Get 
title of artist's Wikipedia article + title = wikipedia_link.rsplit('/')[-1] + title = unquote(title) + return title - return title + return None def get_artist_wikidata_id(self): @@ -312,24 +360,25 @@ def get_artist_wikidata_id(self): wikidata_id: Wikidata ID of the artist ''' - # Get Google Arts & Culture webpage for the artist - self.driver.get(self.artist_link) - - # Locate the element containing the link to the artist's Wikipedia article - element = self.driver.find_element('xpath','//*[contains(@href,"wikipedia")]') - # Extract the link to the Wikipedia article - wikipedia_link = element.get_attribute('href') + # Get the link to the Wikipedia article + wikipedia_link = self.get_wikipedia_article_link() # Get Wikipedia page for the artist - self.driver.get(wikipedia_link) - # Find element containing text about Wikidata - element = self.driver.find_element('xpath','//*[contains(text(),"Wikidata item")]') - # Find parent element of this element - parent_element = element.find_element('xpath', '..') + if wikipedia_link is not None: + self.driver.get(wikipedia_link) + else: + return None + + # Find element containing the link to the Wikidata page + element = self.driver.find_element('xpath','//*[contains(@href,"www.wikidata.org")]') # Extract the link to the Wikidata page - wikidata_link = parent_element.get_attribute('href') + wikidata_link = element.get_attribute('href') + # Specify pattern of Wikidata ID + pattern = r'Q\d+' + # Search for pattern in the Wikidata link + match = re.search(pattern, wikidata_link) # Find the Wikidata ID of the artist - wikidata_id = wikidata_link.rsplit('/')[-1] + wikidata_id = match.group(0) return wikidata_id diff --git a/examples/example_wikipedia_issues_resolved.ipynb b/examples/example_wikipedia_issues_resolved.ipynb new file mode 100644 index 0000000..56ef45d --- /dev/null +++ b/examples/example_wikipedia_issues_resolved.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d83d0691", + "metadata": 
{}, + "source": [ + "# Artist without a description, and artist with a non-English Wikipedia article " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "194ac6af", + "metadata": {}, + "outputs": [], + "source": [ + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8583c7c0", + "metadata": {}, + "outputs": [], + "source": [ + "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c34d9016", + "metadata": {}, + "outputs": [], + "source": [ + "# Maximum number of attempts to perform a task \n", + "max_retries = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c7c7aed1", + "metadata": {}, + "outputs": [], + "source": [ + "min_wait_time = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "43ae9afa", + "metadata": {}, + "outputs": [], + "source": [ + "# Directory in which the data is to be stored\n", + "output_dir = './data'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8d21abe7", + "metadata": {}, + "outputs": [], + "source": [ + "# Artist urls\n", + "artist_urls = ['https://artsandculture.google.com/entity/esther-teichmann/g113vf7r7v',\n", + " 'https://artsandculture.google.com/entity/cornelis-albertus-johannes-schermer/g11bw5_6rgd']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "eb623d6b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/3gEvP4cE7Pyrag\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/3gGO_pWJeaxCQg\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/5wGcqEv2h2Cv6Q\n", + "artwork 
URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/3gEvP4cE7Pyrag\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/QgGLOdNbTYmr7g\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/dQEUb6SPOTxllQ\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/vgHV_6ie8P_Pcw\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/ygGTujVCOMXcWA\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/IwEn4wyeGt_tHA\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/fwGoPLZSHtLn2A\n", + "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/zAE5-HfupzhWJg\n", + "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/cwEOXoCnybuhYA\n", + "artwork URL: https://artsandculture.google.com/asset/ruiter-in-uniform-van-achter-gezien-schermer-cornelis-albertus-johannes/cwG4U2LYgzpmkQ\n", + "artwork URL: https://artsandculture.google.com/asset/landschap-met-cavaleristen-schermer-cornelis-albertus-johannes/YwG0PJ20ZMTI-Q\n", + "artwork URL: https://artsandculture.google.com/asset/ruiter-in-uniform-van-achteren-gezien-schermer-cornelis-albertus-johannes/0wGa8wyBsKqwIg\n", + "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/mAHxJYt_tYhFqw\n", + "artwork URL: https://artsandculture.google.com/asset/landschap-met-cavaleristen-schermer-cornelis-albertus-johannes/KwFJL6DQpYzK1w\n", + 
"artwork URL: https://artsandculture.google.com/asset/cavaleriepaard-en-een-lancier-schermer-cornelis-albertus-johannes/FQGMgcNN9og85g\n", + "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/cwEOXoCnybuhYA\n", + "artwork URL: https://artsandculture.google.com/asset/weiland-met-een-paard-en-twee-koeien-schermer-cornelis-albertus-johannes/NwHUHRjHrk-tWw\n", + "artwork URL: https://artsandculture.google.com/asset/ruiters-met-paarden-schermer-cornelis-albertus-johannes/3wHC3D3PLFXz9w\n", + "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/7QGRGjdJFlusww\n", + "artwork URL: https://artsandculture.google.com/asset/landschap-met-cavaleristen-schermer-cornelis-albertus-johannes/oAFFsCDCMIg2sw\n", + "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/cQH-x8lIGO0IPQ\n", + "CPU times: user 102 ms, sys: 66.8 ms, total: 169 ms\n", + "Wall time: 16.9 s\n" + ] + } + ], + "source": [ + "%%time \n", + "\n", + "# Find_artworks for each artist\n", + "for artist_url in artist_urls:\n", + " with FindArtworks(artist_link=artist_url, output_dir=output_dir, \n", + " min_wait_time=min_wait_time) as scraper:\n", + " # Save list of works, description, and metadata for an artist\n", + " retry(scraper.save_artist_information, max_retries, min_wait_time)\n", + " # Create directory for this artist\n", + " artist_dir = output_dir + '/' + scraper.get_artist_name() \n", + " # Get list of links to this artist's works \n", + " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", + " artwork_links = [line.rstrip() for line in file] \n", + " # Scrape artworks\n", + " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", + " # Go through each artwork link\n", + " for url in artwork_links:\n", + " print(f'artwork URL: {url}')\n", 
+ " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" + ] + }, + { + "cell_type": "markdown", + "id": "3519fb2e", + "metadata": {}, + "source": [ + "## Display results" + ] + }, + { + "cell_type": "markdown", + "id": "f8cfeece", + "metadata": {}, + "source": [ + "### Display contents of data directory" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9f56346a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cornelis_Albertus_Johannes_Schermer Esther_Teichmann\r\n" + ] + } + ], + "source": [ + "!ls data" + ] + }, + { + "cell_type": "markdown", + "id": "eff822f0", + "metadata": {}, + "source": [ + "### Display contents of directory for artist Esther Teichmann" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fc4d3d90", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "description.txt metadata.json\tworks works.txt\r\n" + ] + } + ], + "source": [ + "!ls ./data/Esther_Teichmann" + ] + }, + { + "cell_type": "markdown", + "id": "e0921cb7", + "metadata": {}, + "source": [ + "#### Description of artist" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "38079197", + "metadata": {}, + "outputs": [], + "source": [ + "!cat ./data/Esther_Teichmann/description.txt" + ] + }, + { + "cell_type": "markdown", + "id": "322e9c5b", + "metadata": {}, + "source": [ + "#### Metadata of artist" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e5ef192", + "metadata": {}, + "outputs": [], + "source": [ + "!cat ./data/Esther_Teichmann/metadata.json" + ] + }, + { + "cell_type": "markdown", + "id": "163c0111", + "metadata": {}, + "source": [ + "### Display contents of directory for artist Cornelis Albertus Johannes Schermer" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "722d2e48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "description.txt metadata.json\tworks works.txt\r\n" + ] + } + ], + "source": [ + "!ls ./data/Cornelis_Albertus_Johannes_Schermer" + ] + }, + { + "cell_type": "markdown", + "id": "719e56e1", + "metadata": {}, + "source": [ + "#### Description of artist" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1e0fb399", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cornelis Albertus Johannes Schermer (* 12. Juni 1824 in Den Haag; † 4. Januar 1915 ebenda) war ein niederländischer Pferdemaler und Radierer.\r\n", + "Schermer war von 1841 bis 1844 Student der Koninklijke Academie van Beeldende Kunsten in Den Haag unter der Leitung von Cornelis Kruseman und Jacobus Everhardus Josephus van den Berg, Er wurde auch von Joseph Moerenhout (1801–1874) beraten.\r\n", + "1875 belebte er ein Vedutengemälde von Carel Jacobus Behr mit Figuren. \r\n", + "Schermer war in Den Haag und von 1880 bis 1903 in Bouvignies bei Dinant tätig.\r\n", + "Vön 1846 bis 1903 zeigte er seine Werke auf den Ausstellungen in Amsterdam. Den Haag und Rotterdam, signierte seine Werke mit „C. Schermer“." 
+ ] + } + ], + "source": [ + "!cat ./data/Cornelis_Albertus_Johannes_Schermer/description.txt" + ] + }, + { + "cell_type": "markdown", + "id": "93fad724", + "metadata": {}, + "source": [ + "#### Metadata of artist" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a4c5595a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"family name\": \"\", \"given name\": \"Cornelis\", \"pseudonym\": \"\", \"sex or gender\": \"male\", \"date of birth\": \"1824-06-12\", \"place of birth\": \"The Hague\", \"latitude of place of birth\": \"52.08\", \"longitude of place of birth\": \"4.31\", \"date of death\": \"1915-01-04\", \"place of death\": \"The Hague\", \"latitude of place of death\": \"52.08\", \"longitude of place of death\": \"4.31\", \"country of citizenship\": \"Kingdom of the Netherlands\", \"residence\": \"\", \"work location\": \"\", \"genre\": \"\", \"movement\": \"\", \"occupation\": \"painter\"}" + ] + } + ], + "source": [ + "!cat ./data/Cornelis_Albertus_Johannes_Schermer/metadata.json" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}