diff --git a/artscraper/googleart.py b/artscraper/googleart.py index 47c68fc..eb87d48 100755 --- a/artscraper/googleart.py +++ b/artscraper/googleart.py @@ -7,6 +7,8 @@ from urllib.parse import urlparse from urllib.parse import unquote +import hashlib + from bs4 import BeautifulSoup from selenium import webdriver from selenium.common.exceptions import NoSuchElementException @@ -45,27 +47,49 @@ def load_link(self, link): if link == self.link: return False self.link = link - if self.output_dir is not None: if (self.paint_dir.is_dir() and self.skip_existing and Path(self.paint_dir, "metadata.json").is_file() - and Path(self.paint_dir, "artwork.png").is_file()): + and Path(self.paint_dir, "artwork.png").is_file() + and Path(self.paint_dir, "artwork.png").stat().st_size>0): return False self.paint_dir.mkdir(exist_ok=True, parents=True) - self.wait(self.min_wait) self.driver.get(link) return True @property def paint_dir(self): + paint_id = "_".join(urlparse(self.link).path.split("/")[-2:]) # Prevent problems with character encoding/decoding paint_id = unquote(paint_id) - # Prevent problems with too-long file/directory names - if len(paint_id)>=256: - paint_id = paint_id[0:255] + # Byte string + paint_id_encoded = paint_id.encode('utf-8') + # Length of directory name in bytes + byte_length = len(paint_id_encoded) + + # Prevent problems with too-long directory names + # 255 bytes is the maximum length of a directory on Windows + + # Set maximum length for the part of the directory name derived from the + # Google Arts & Culture url for the artwork + max_byte_length = 240 + hash_length = 40 + if byte_length >= max_byte_length: + truncated_byte_string = paint_id_encoded[:max_byte_length-hash_length] + # Decode back to string, handling possible incomplete character at the end + while True: + try: + truncated_directory_name = truncated_byte_string.decode('utf-8') + break + except UnicodeDecodeError: + # Remove the last byte and try again + truncated_byte_string = truncated_byte_string[:-1] + # Create hopefully-unique directory name that doesn't exceed + # maximum allowed directory length + paint_id = truncated_directory_name + '_' + hashlib.sha1(paint_id_encoded).hexdigest() return Path(self.output_dir, paint_id) @@ -137,14 +161,19 @@ def get_image(self): self.wait(self.min_wait) elem = self.driver.find_element( "xpath", "/html/body/div[3]/div[3]/div/div/div[2]/div[3]") + webdriver.ActionChains( self.driver).move_to_element(elem).click(elem).perform() + self.wait(self.min_wait * 2, update=False) elem = self.driver.find_element( "xpath", "/html/body/div[3]/div[3]/div/div/div[2]/div[3]") + img = elem.screenshot_as_png + self.wait(self.min_wait) self.driver.find_element("xpath", "/html/body").send_keys(Keys.ESCAPE) + return img def save_image(self, img_fp=None, link=None): @@ -164,11 +193,13 @@ def save_image(self, img_fp=None, link=None): img_fp = self._convert_img_fp(img_fp, suffix=".png") - if self.skip_existing and img_fp.is_file(): + if self.skip_existing and img_fp.is_file() and img_fp.stat().st_size!=0: return + with open(img_fp, "wb") as f: f.write(self.get_image()) + def save_artwork_information(self, link): """ Given an artwork link, saves the image and the associated metadata. diff --git a/examples/example_collect_all_artworks.ipynb b/examples/example_collect_all_artworks.ipynb index 3aa86f7..89ac79e 100644 --- a/examples/example_collect_all_artworks.ipynb +++ b/examples/example_collect_all_artworks.ipynb @@ -19,14 +19,12 @@ ] }, { - "cell_type": "code", - "execution_count": 2, - "id": "77cd3e7d", + "cell_type": "raw", + "id": "3718c2c0", "metadata": {}, - "outputs": [], "source": [ "# Get links for all artists, as a list\n", - "#artist_urls = get_artist_links(executable_path='geckodriver', min_wait_time=1, output_file='artist_links.txt')" + "artist_urls = get_artist_links(executable_path='geckodriver', min_wait_time=1, output_file='artist_links.txt')" ] }, { @@ -37,29 +35,38 @@ "# Collect artworks and metadata for all artists" ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dadfbb4d", + "metadata": {}, + "outputs": [], + "source": [ + "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" + ] + }, { "cell_type": "code", "execution_count": 3, - "id": "8583c7c0", + "id": "c3a16846", "metadata": {}, "outputs": [], "source": [ - "from artscraper import GoogleArtScraper, FindArtworks" + "# Sample artist link, for illustration purposes\n", + "artist_urls = ['https://artsandculture.google.com/entity/jan-van-der-heyden/m05g5_1']" ] }, { "cell_type": "code", "execution_count": 4, - "id": "c3a16846", + "id": "bba4ec54", "metadata": {}, "outputs": [], "source": [ - "# Small subset of artist links, for illustration purposes\n", - "# (3 artists, each with < 5 artworks)\n", - "artist_urls = ['https://artsandculture.google.com/entity/william-notman/m04mpzj',\n", - " 'https://artsandculture.google.com/entity/alexander-keirincx/m03cxjmm',\n", - " 'https://artsandculture.google.com/entity/abraham-lambertsz-van-den-tempel/m09g78pg'\n", - " ]" + "# Maximum number of attempts to perform a task \n", + "max_retries = 3\n", + "# Minimum time (in seconds) to wait before retrying\n", + "min_wait_time = 10" ] }, { @@ -75,38 +82,30 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "2be1700c", + "execution_count": null, + "id": "09a90d85", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 389 ms, sys: 37.9 ms, total: 427 ms\n", - "Wall time: 10min 18s\n" - ] - } - ], + "outputs": [], "source": [ "%%time \n", "\n", "# Find_artworks for each artist\n", "for artist_url in artist_urls:\n", - " with FindArtworks(artist_link=artist_url, output_dir=output_dir, min_wait_time=10) as scraper:\n", + " with FindArtworks(artist_link=artist_url, output_dir=output_dir, \n", + " min_wait_time=min_wait_time) as scraper:\n", " # Save list of works, description, and metadata for an artist\n", - " scraper.save_artist_information()\n", - " # Get list of links to this artist's works \n", - " artwork_links = scraper.get_artist_works()\n", + " retry(scraper.save_artist_information, max_retries, min_wait_time)\n", " # Create directory for this artist\n", - " artist_dir = output_dir + '/' + scraper.get_wikipedia_article_title() \n", + " artist_dir = output_dir + '/' + scraper.get_artist_name() \n", + " # Get list of links to this artist's works \n", + " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", + " artwork_links = [line.rstrip() for line in file] \n", " # Scrape artworks\n", - " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=10) as subscraper:\n", + " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", " # Go through each artwork link\n", " for url in artwork_links:\n", - " subscraper.load_link(url)\n", - " subscraper.save_metadata()\n", - " subscraper.save_image()" + " print(f'artwork URL: {url}')\n", + " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" ] }, { @@ -135,7 +134,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Abraham_Lambertsz_van_den_Tempel Alexander_Keirincx William_Notman\r\n" + "Jan_Van_Der_Heyden Juana_Alicia\r\n" ] } ], @@ -161,12 +160,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "description.txt metadata.txt works works.txt\r\n" + "description.txt metadata.json\tworks works.txt\r\n" ] } ], "source": [ - "!ls ./data/William_Notman/" + "!ls ./data/Jan_Van_Der_Heyden/" ] }, { @@ -187,12 +186,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "William Notman (8 March 1826 – 25 November 1891) was a Scottish-Canadian photographer and businessman. The Notman House in Montreal was his home from 1876 until his death in 1891, and it has since been named after him. Notman was the first photographer in Canada to achieve international recognition." + "Jan van der Heyden (5 March 1637, Gorinchem – 28 March 1712, Amsterdam) was a Dutch Baroque-era painter, glass painter, draughtsman and printmaker. Van der Heyden was one of the first Dutch painters to specialize in townscapes and became one of the leading architectural painters of the Dutch Golden Age. He painted a number of still lifes in the beginning and at the end of his career.Jan van der Heyden was also an engineer and inventor who made significant contributions to contemporary firefighting technology. Together with his brother Nicolaes, who was a hydraulic engineer, he invented an improvement of the fire hose in 1672. He modified the manual fire engine, reorganised the volunteer fire brigade (1685) and wrote and illustrated the first firefighting manual (Brandspuiten-boek). A comprehensive street lighting scheme for Amsterdam, designed and implemented by van der Heyden, remained in operation from 1669 until 1840 and was adopted as a model by many other towns and abroad." ] } ], "source": [ - "!cat ./data/William_Notman/description.txt" + "!cat ./data/Jan_Van_Der_Heyden/description.txt" ] }, { @@ -213,27 +212,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "family name : Notman\r\n", - "given name : William\r\n", - "sex or gender : male\r\n", - "date of birth : 1826-03-08\r\n", - "place of birth : Paisley\r\n", - "latitude of place of birth : 55.845555555\r\n", - "longitude of place of birth : -4.423888888\r\n", - "date of death : 1826-03-08\r\n", - "place of death : Montreal\r\n", - "latitude of place of death : 45.508888888\r\n", - "longitude of place of death : -73.561666666\r\n", - "country of citizenship : ['Canada', 'United Kingdom of Great Britain and Ireland']\r\n", - "residence : Canada\r\n", - "work location : ['New York City', 'Boston', 'Toronto', 'Montreal', 'Philadelphia', 'Ottawa', 'Halifax', 'Niagara Falls', 'Cambridge']\r\n", - "genre : \r\n", - "movement : \r\n" + "{\"family name\": \"Van der Heyden\", \"given name\": \"Jan\", \"pseudonym\": \"\", \"sex or gender\": \"male\", \"date of birth\": \"1637-03-05\", \"place of birth\": \"Gorinchem\", \"latitude of place of birth\": \"51.83652\", \"longitude of place of birth\": \"4.97243\", \"date of death\": \"1712-03-28\", \"place of death\": \"Amsterdam\", \"latitude of place of death\": \"52.372777777\", \"longitude of place of death\": \"4.893611111\", \"country of citizenship\": \"Netherlands\", \"residence\": \"\", \"work location\": \"Amsterdam\", \"genre\": \"landscape art\", \"movement\": \"\", \"occupation\": [\"firefighter\", \"inventor\", \"painter\", \"instrument maker\", \"printmaker\"]}" ] } ], "source": [ - "!cat ./data/William_Notman/metadata.txt" + "!cat ./data/Jan_Van_Der_Heyden/metadata.json" ] }, { @@ -246,23 +230,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "54afc420", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "bald-eagle-white-headed-eagle-william-notman_oQGugt5O8az3bA\r\n", - "montreal-from-street-railway-power-house-chimney-qc__AHTyBmsOEhjaw\r\n", - "mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc_GQEUtJuLAhf54w\r\n", - "the-terra-nova-snowshoe-club-montreal-notman-william-sandham-henry_OQFIdhZoZj9eOg\r\n" - ] - } - ], + "outputs": [], "source": [ - "!ls ./data/William_Notman/works" + "!ls ./data/Jan_Van_Der_Heyden/works" ] }, { @@ -288,7 +261,7 @@ } ], "source": [ - "!ls ./data/William_Notman/works/mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc_GQEUtJuLAhf54w/" + "!ls ./data/Jan_Van_Der_Heyden/works/country-house-on-the-vliet-near-delft-jan-van-de-heyden_3wEgj7D5Ld8nvg" ] }, { @@ -313,11 +286,11 @@ { "cell_type": "code", "execution_count": null, - "id": "8f4e1e93", + "id": "046657bf", "metadata": {}, "outputs": [], "source": [ - "img = mpimg.imread(\"./data/William_Notman/works/mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc_GQEUtJuLAhf54w/artwork.png\")\n", + "img = mpimg.imread(\"./data/Jan_Van_Der_Heyden/works/country-house-on-the-vliet-near-delft-jan-van-de-heyden_3wEgj7D5Ld8nvg/artwork.png\")\n", "plt.imshow(img)\n", "plt.show()" ] @@ -340,12 +313,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"main_text\": \"\", \"title\": \"Mrs. William MacKenzie in Allan's conservatory, Montreal, QC\", \"date\": \"1871/1871\", \"type\": \"Photographie, Photograph\", \"titre\": \"Mme William MacKenzie, Montr\\u00e9al, QC\", \"photographer\": \"William Notman\", \"credit line\": \"Purchase from Associated Screen News Ltd., Achat de l'Associated Screen News Ltd.\", \"rights\": \"http://www.musee-mccord.qc.ca/en/orders/conditions/\", \"external link\": \" http://www.mccord-museum.qc.ca/en/collection/artifacts/I-63833\", \"medium\": \"Silver salts on glass - Wet collodion process, Plaque de verre au collodion humide\", \"id\": \"GQEUtJuLAhf54w\", \"link\": \"https://artsandculture.google.com/asset/mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc/GQEUtJuLAhf54w\"}" + "{\"main_text\": \"The country house in the right middle ground has been identified as one which used to lie on the river Vliet, running between Delft and The Hague. Though this is possible, the house does not seem sufficiently distinctive to permit such a specific identification. This scene, however, depicts a fashionable part of Holland in the seventeenth century: a navigable canal or river with a well-kept towpath and a considerable volume of freight traffic. Lining the water are houses with plots of land extending into the flat, low-lying, fertile, reclaimed land. There is an alternation of elegant farmhouses, like the one with a stepped gable and hayrick, and buitenplaatsen (country houses), like the one nearer to us, with its ionic pilasters and dormer windows with scroll surrounds (as opposed to the more traditional gables). This house has a stone gate and a topiary hedge with claire-vues and an avenue of trees. Audrey Lambert reproduces a 1770 map of Rijswijk, between Delft and The Hague, which still shows exactly this alternation of simple plots and formal gardens extending into the polders on either side of the Vliet and nearby roads. This image by Heyden (1637-1712) is notable for its restrained depiction of evening light, with more white than gold in the spectrum and just a hint of pink in some of the clouds. But it is the vivid naturalism of the scene, with its matter-of-fact viewpoint, recording a public thoroughfare with no deference to the country house, which so remarkably anticipates the landscapes of the Impressionists. It is also possible that Constable had seen this painting when he painted his Scene on a Navigable River in 1816-17 (Tate, London), with its sparkling pointillist touch and scrupulous record of a working inland waterway.\", \"title\": \"Country House on the Vliet near Delft\", \"creator\": \"Jan van de Heyden\", \"creator lifespan\": \"1637 - 1712\", \"date created\": \"1665\", \"type\": \"Painting\", \"rights\": \"Supplied by Royal Collection Trust / (c) HM Queen Elizabeth II 2012\", \"external link\": \" http://www.rct.uk/collection/405948\", \"medium\": \"Oil on panel\", \"provenance\": \"Acquired by George IV when Prince Regent, 1814\", \"object description\": \"Beside a canal runs a road on which a huntsman walks his dog, with a country house & an outbuilding on the right; a mother and her children are seated by the road; in the centre a barge is moored to a landing-stage.\", \"id\": \"3wEgj7D5Ld8nvg\", \"link\": \"https://artsandculture.google.com/asset/country-house-on-the-vliet-near-delft-jan-van-de-heyden/3wEgj7D5Ld8nvg\"}" ] } ], "source": [ - "!cat ./data/William_Notman/works/mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc_GQEUtJuLAhf54w//metadata.json" + "!cat ./data/Jan_Van_Der_Heyden/works/country-house-on-the-vliet-near-delft-jan-van-de-heyden_3wEgj7D5Ld8nvg/metadata.json" ] } ], diff --git a/examples/example_collect_all_artworks_error_handling_retrying.ipynb b/examples/example_collect_all_artworks_error_handling_retrying.ipynb deleted file mode 100644 index 23e21a6..0000000 --- a/examples/example_collect_all_artworks_error_handling_retrying.ipynb +++ /dev/null @@ -1,499 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d83d0691", - "metadata": {}, - "source": [ - "# Collect artworks and metadata for all artists" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "194ac6af", - "metadata": {}, - "outputs": [], - "source": [ - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8583c7c0", - "metadata": {}, - "outputs": [], - "source": [ - "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c34d9016", - "metadata": {}, - "outputs": [], - "source": [ - "# Maximum number of attempts to perform a task \n", - "max_retries = 10" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c7c7aed1", - "metadata": {}, - "outputs": [], - "source": [ - "min_wait_time = 10" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8d21abe7", - "metadata": {}, - "outputs": [], - "source": [ - "# Artist Clementine Hunter, 27 artworks\n", - "artist_urls = ['https://artsandculture.google.com/entity/clementine-hunter/m0d1k7n']" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "43ae9afa", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory in which the data is to be stored\n", - "output_dir = './data'" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "eb623d6b", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork URL: https://artsandculture.google.com/asset/quilt-clementine-hunter/_gFkai2V-4ydag\n", - "artwork URL: https://artsandculture.google.com/asset/zinnias-in-a-pot-clementine-hunter/mwGD6E7Ek5acqQ\n", - "artwork URL: https://artsandculture.google.com/asset/melrose-quilt-clementine-hunter/zAEML8E_JHdZBw\n", - "artwork URL: https://artsandculture.google.com/asset/funeral-procession-clementine-hunter/3gHi9tMtAF2big\n", - "artwork URL: https://artsandculture.google.com/asset/chevron-quilt-clementine-hunter/UQGTB4IChKZ6Qw\n", - "artwork URL: https://artsandculture.google.com/asset/chaleur-the-sun-gives-life-to-everything-clementine-hunter/HwGBfvookbPqkQ\n", - "artwork URL: https://artsandculture.google.com/asset/flowers-clementine-hunter/zQERekxk8d_F8g\n", - "artwork URL: https://artsandculture.google.com/asset/crucifixion-with-angel-clementine-hunter/MwHjeBEoiNhbbQ\n", - "artwork URL: https://artsandculture.google.com/asset/cooking-out-clementine-hunter/rQGtvTjBIYIJ6Q\n", - "artwork URL: https://artsandculture.google.com/asset/panorama-of-baptism-on-cane-river-clementine-hunter/EwGOfLBe5vUg2g\n", - "artwork URL: https://artsandculture.google.com/asset/floral-mosaic-5-clementine-hunter/zQGnBVJlybWfzw\n", - "artwork URL: https://artsandculture.google.com/asset/chickens-clementine-hunter/0wEvGq3AgMynow\n", - "artwork URL: https://artsandculture.google.com/asset/birds-and-flowers-clementine-hunter/OAHgv0AFrobJPQ\n", - "artwork URL: https://artsandculture.google.com/asset/minding-baby-clementine-hunter/FQGa7FzDuGM0cg\n", - "artwork URL: https://artsandculture.google.com/asset/fall-fireworks-clementine-hunter/FwEDCbEb6A9hig\n", - "artwork URL: https://artsandculture.google.com/asset/farmlands-clementine-hunter/UwGchbC0dry5DA\n", - "artwork URL: https://artsandculture.google.com/asset/catus-in-a-red-bowl-clementine-hunter/cAH94kfbGdPPqQ\n", - "artwork URL: https://artsandculture.google.com/asset/flowers-in-a-jar-clementine-hunter/3gHpffPjlfmLQA\n", - "artwork URL: https://artsandculture.google.com/asset/street-of-the-neighborhood-clementine-hunter/zAG27Fcfy4v7AQ\n", - "artwork URL: https://artsandculture.google.com/asset/fish-in-the-ocean-clementine-hunter/_QFdEcDCjuN2Vg\n", - "artwork URL: https://artsandculture.google.com/asset/village-no-1-clementine-hunter/XgEyBhu4t7gXZQ\n", - "artwork URL: https://artsandculture.google.com/asset/fish-bowl-clementine-hunter/WQFNS6_tEf2jjg\n", - "artwork URL: https://artsandculture.google.com/asset/quilt-no-2-clementine-hunter/FAH2dn3Eh8QR_Q\n", - "artwork URL: https://artsandculture.google.com/asset/circus-clementine-hunter/AgHnJrgLFpQuCg\n", - "artwork URL: https://artsandculture.google.com/asset/quilt-clementine-hunter/_gFkai2V-4ydag\n", - "artwork URL: https://artsandculture.google.com/asset/feeding-birds-clementine-hunter/lwHj85ayu4zyBA\n", - "artwork URL: https://artsandculture.google.com/asset/wash-day-clementine-hunter/rgENCOZdm4aAKw\n", - "artwork URL: https://artsandculture.google.com/asset/birds-clementine-hunter/OQGQcPFtMUbT5Q\n", - "CPU times: user 725 ms, sys: 129 ms, total: 854 ms\n", - "Wall time: 35min\n" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "# Find_artworks for each artist\n", - "for artist_url in artist_urls:\n", - " with FindArtworks(artist_link=artist_url, output_dir=output_dir, \n", - " min_wait_time=min_wait_time) as scraper:\n", - " # Save list of works, description, and metadata for an artist\n", - " retry(scraper.save_artist_information, max_retries, min_wait_time)\n", - " # Create directory for this artist\n", - " artist_dir = output_dir + '/' + scraper.get_wikipedia_article_title() \n", - " # Get list of links to this artist's works \n", - " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", - " artwork_links = [line.rstrip() for line in file] \n", - " # Scrape artworks\n", - " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", - " # Go through each artwork link\n", - " for url in artwork_links:\n", - " print(f'artwork URL: {url}')\n", - " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" - ] - }, - { - "cell_type": "markdown", - "id": "3519fb2e", - "metadata": {}, - "source": [ - "# Display results" - ] - }, - { - "cell_type": "markdown", - "id": "f8cfeece", - "metadata": {}, - "source": [ - "## Display contents of data directory" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9f56346a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A._Y._Jackson Clementine_Hunter Hokusai\r\n" - ] - } - ], - "source": [ - "!ls data" - ] - }, - { - "cell_type": "markdown", - "id": "eff822f0", - "metadata": {}, - "source": [ - "## Display contents of directory for one artist" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "fc4d3d90", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "description.txt metadata.json\tworks works.txt\r\n" - ] - } - ], - "source": [ - "!ls ./data/Clementine_Hunter" - ] - }, - { - "cell_type": "markdown", - "id": "e0921cb7", - "metadata": {}, - "source": [ - "## Description of artist" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "38079197", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Clementine Hunter (pronounced Clementeen; late December 1886 or early January 1887 – January 1, 1988) was a self-taught Black folk artist from the Cane River region of Louisiana, who lived and worked on Melrose Plantation.\r\n", - "Hunter was born into a Louisiana Creole family at Hidden Hill Plantation near Cloutierville, in Natchitoches Parish, Louisiana. She started working as a farm laborer when young, and never learned to read or write. In her fifties, she began to sell her paintings, which soon gained local and national attention for their complexity in depicting Black Southern life in the early 20th century.\r\n", - "Initially she sold her first paintings for as little as 25 cents. But by the end of her life, her work was being exhibited in museums and sold by dealers for thousands of dollars. Clementine Hunter produced an estimated 5,000 to 10,000 paintings in her lifetime. Hunter was granted an honorary Doctor of Fine Arts degree by Northwestern State University of Louisiana in 1986, and she is the first African-American artist to have a solo exhibition at the present-day New Orleans Museum of Art. In 2013, director Robert Wilson presented a new opera about her, entitled Zinnias: the Life of Clementine Hunter, at Montclair State University in New Jersey." - ] - } - ], - "source": [ - "!cat ./data/Clementine_Hunter/description.txt" - ] - }, - { - "cell_type": "markdown", - "id": "322e9c5b", - "metadata": {}, - "source": [ - "## Metadata of artist" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2e5ef192", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"family name\": \"Hunter\", \"given name\": [\"Clementine\", \"Clementina\"], \"sex or gender\": \"female\", \"date of birth\": [\"1889-01-01\", \"1886-01-01\"], \"place of birth\": [\"Cloutierville\", \"Louisiana\"], \"latitude of place of birth\": [\"31.5433\", \"31.0\"], \"longitude of place of birth\": [\"-92.9183\", \"-92.0\"], \"date of death\": [\"1889-01-01\", \"1886-01-01\"], \"place of death\": [\"Natchitoches\", \"Natchitoches Parish\"], \"latitude of place of death\": [\"31.7431\", \"31.73\"], \"longitude of place of death\": [\"-93.095\", \"-93.1\"], \"country of citizenship\": \"United States of America\", \"residence\": \"\", \"work location\": \"\", \"genre\": \"portrait\", \"movement\": \"\", \"occupation\": [\"artist\", \"painter\"]}" - ] - } - ], - "source": [ - "!cat ./data/Clementine_Hunter/metadata.json" - ] - }, - { - "cell_type": "markdown", - "id": "63251f32", - "metadata": {}, - "source": [ - "## Directory containing works of this artist" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "1cd0d995", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "birds-and-flowers-clementine-hunter_OAHgv0AFrobJPQ\r\n", - "birds-clementine-hunter_OQGQcPFtMUbT5Q\r\n", - "catus-in-a-red-bowl-clementine-hunter_cAH94kfbGdPPqQ\r\n", - "chaleur-the-sun-gives-life-to-everything-clementine-hunter_HwGBfvookbPqkQ\r\n", - "chevron-quilt-clementine-hunter_UQGTB4IChKZ6Qw\r\n", - "chickens-clementine-hunter_0wEvGq3AgMynow\r\n", - "circus-clementine-hunter_AgHnJrgLFpQuCg\r\n", - "cooking-out-clementine-hunter_rQGtvTjBIYIJ6Q\r\n", - "crucifixion-with-angel-clementine-hunter_MwHjeBEoiNhbbQ\r\n", - "fall-fireworks-clementine-hunter_FwEDCbEb6A9hig\r\n", - "farmlands-clementine-hunter_UwGchbC0dry5DA\r\n", - "feeding-birds-clementine-hunter_lwHj85ayu4zyBA\r\n", - "fish-bowl-clementine-hunter_WQFNS6_tEf2jjg\r\n", - "fish-in-the-ocean-clementine-hunter__QFdEcDCjuN2Vg\r\n", - "floral-mosaic-5-clementine-hunter_zQGnBVJlybWfzw\r\n", - "flowers-clementine-hunter_zQERekxk8d_F8g\r\n", - "flowers-in-a-jar-clementine-hunter_3gHpffPjlfmLQA\r\n", - "funeral-procession-clementine-hunter_3gHi9tMtAF2big\r\n", - "melrose-quilt-clementine-hunter_zAEML8E_JHdZBw\r\n", - "minding-baby-clementine-hunter_FQGa7FzDuGM0cg\r\n", - "panorama-of-baptism-on-cane-river-clementine-hunter_EwGOfLBe5vUg2g\r\n", - "quilt-clementine-hunter__gFkai2V-4ydag\r\n", - "quilt-no-2-clementine-hunter_FAH2dn3Eh8QR_Q\r\n", - "street-of-the-neighborhood-clementine-hunter_zAG27Fcfy4v7AQ\r\n", - "village-no-1-clementine-hunter_XgEyBhu4t7gXZQ\r\n", - "wash-day-clementine-hunter_rgENCOZdm4aAKw\r\n", - "zinnias-in-a-pot-clementine-hunter_mwGD6E7Ek5acqQ\r\n" - ] - } - ], - "source": [ - "!ls ./data/Clementine_Hunter/works" - ] - }, - { - "cell_type": "markdown", - "id": "4c20d8c2", - "metadata": {}, - "source": [ - "## Directory containing one artwork by this artist" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "256919d3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork.png metadata.json\r\n" - ] - } - ], - "source": [ - "!ls ./data/Clementine_Hunter/works/flowers-clementine-hunter_zQERekxk8d_F8g" - ] - }, - { - "cell_type": "markdown", - "id": "6829e0a2", - "metadata": {}, - "source": [ - "## Display metadata for this artwork" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "b5504ef7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"main_text\": \"Oil on canvas still life painting of a vase of flowers. The brightly colored orange vase has a handle on the right side. It contains green foliage and red, yellow, orange, and white flowers. The background is a dark brownish black.\", \"title\": \"Flowers\", \"creator\": \"Clementine Hunter\", \"date created\": \"1973\", \"id\": \"zQERekxk8d_F8g\", \"link\": \"https://artsandculture.google.com/asset/flowers-clementine-hunter/zQERekxk8d_F8g\"}" - ] - } - ], - "source": [ - "!cat ./data/Clementine_Hunter/works/flowers-clementine-hunter_zQERekxk8d_F8g/metadata.json" - ] - }, - { - "cell_type": "markdown", - "id": "2810ec59", - "metadata": {}, - "source": [ - "## Checking that too-long file/directory names now work" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "c783a9ad", - "metadata": {}, - "outputs": [], - "source": [ - "artwork_links = ['https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg']" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "e38b6ff7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork URL: https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg\n" - ] - } - ], - "source": [ - "with GoogleArtScraper(output_dir + '/' + 'Hokusai' + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", - " # Go through each artwork link\n", - " for url in artwork_links:\n", - " print(f'artwork URL: {url}')\n", - " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" - ] - }, - { - "cell_type": "markdown", - "id": "77f7868f", - "metadata": {}, - "source": [ - "## Directory containing works of this artist" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "5bd7654a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai_hg\r\n" - ] - } - ], - "source": [ - "!ls ./data/Hokusai/works" - ] - }, - { - "cell_type": "markdown", - "id": "2a521a4c", - "metadata": {}, - "source": [ - "## Directory containing one artwork by this artist" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "7a85d111", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork.png metadata.json\r\n" - ] - } - ], - "source": [ - "!ls ./data/Hokusai/works/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai_hg" - ] - }, - { - "cell_type": "markdown", - "id": "76b31701", - "metadata": {}, - "source": [ - "## Display metadata for this artwork" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "2e5d6e6d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"main_text\": \"\", \"title\": \"\\u51a8\\u5dbd\\u4e09\\u5341\\u516d\\u666f\\u3000\\u7532\\u5dde\\u77f3\\u73ed\\u6ca2|Kajikazawa in Kai Province (K\\u014dsh\\u016b Kajikazawa), from the series Thirty-six Views of Mount Fuji (Fugaku sanj\\u016brokkei)\", \"creator\": \"Katsushika Hokusai\", \"date created\": \"ca. 1830\\u201332\", \"physical dimensions\": \"10 1/4 x 15 1/8 in. (26 x 38.4 cm)\", \"type\": \"Woodblock print\", \"external link\": \" http://www.metmuseum.org/art/collection/search/39800\", \"medium\": \"Polychrome woodblock print; ink and color on paper\", \"repository\": \"Metropolitan Museum of Art, New York, NY\", \"period\": \"Edo period (1615\\u20131868)\", \"culture\": \"Japan\", \"id\": \"hgHQaDeXBcllwg\", \"link\": \"https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg\"}" - ] - } - ], - "source": [ - "!cat ./data/Hokusai/works/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai_hg/metadata.json" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/example_collect_all_artworks_url_decoding.ipynb b/examples/example_collect_all_artworks_url_decoding.ipynb deleted file mode 100644 index cd7a5e8..0000000 --- a/examples/example_collect_all_artworks_url_decoding.ipynb +++ /dev/null @@ -1,465 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d83d0691", - "metadata": {}, - "source": [ - "# Example artist: Zdeněk Sýkora " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "194ac6af", - "metadata": {}, - "outputs": [], - "source": [ - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8583c7c0", - "metadata": {}, - "outputs": [], - "source": [ - "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c34d9016", - "metadata": {}, - "outputs": [], - "source": [ - "# Maximum number of attempts to perform a task \n", - "max_retries = 3" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c7c7aed1", - "metadata": {}, - "outputs": [], - "source": [ - "min_wait_time = 10" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "43ae9afa", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory in which the data is to be stored\n", - "output_dir = './data'" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "8d21abe7", - "metadata": {}, - "outputs": [], - "source": [ - "# Artist Zdeněk Sýkora, 3 artworks\n", - "artist_urls = ['https://artsandculture.google.com/entity/zden%C4%9Bk-s%C3%BDkora/m0gyrctv']" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "eb623d6b", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WDM] - Downloading: 19.2kB [00:00, 9.56MB/s] \n", - "[WDM] - Downloading: 100%|█████████████████| 2.93M/2.93M [00:00<00:00, 75.4MB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork URL: https://artsandculture.google.com/asset/line-no-56-humberto-zden%C4%9Bk-s%C3%BDkora/XgGPSy53OVWyaw\n", - "artwork URL: https://artsandculture.google.com/asset/black-lines-zden%C4%9Bk-s%C3%BDkora/NwEzX_kYKTNDOA\n", - "artwork URL: https://artsandculture.google.com/asset/black-and-white-structure-zdenek-sykora/6gGt7GEf2h9Yuw\n", - "artwork URL: https://artsandculture.google.com/asset/line-no-56-humberto-zden%C4%9Bk-s%C3%BDkora/XgGPSy53OVWyaw\n", - "CPU times: user 267 ms, sys: 35 ms, total: 302 ms\n", - "Wall time: 11 s\n" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "# Find_artworks for each artist\n", - "for artist_url in artist_urls:\n", - " with FindArtworks(artist_link=artist_url, output_dir=output_dir, \n", - " min_wait_time=min_wait_time) as scraper:\n", - " # Save list of works, description, and metadata for an artist\n", - " retry(scraper.save_artist_information, max_retries, min_wait_time)\n", - " # Create directory for this artist\n", - " artist_dir = output_dir + '/' + scraper.get_wikipedia_article_title() \n", - " # Get list of links to this artist's works \n", - " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", - " artwork_links = [line.rstrip() for line in file] \n", - " # Scrape artworks\n", - " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", - " # Go through each artwork link\n", - " for url in artwork_links:\n", - " print(f'artwork URL: {url}')\n", - " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" - ] - }, - { - "cell_type": "markdown", - "id": "3519fb2e", - "metadata": {}, - "source": [ - "## Display results" - ] - }, - { - "cell_type": "markdown", - "id": "f8cfeece", - "metadata": {}, - "source": [ - "### Display contents of data directory" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9f56346a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A._Y._Jackson Clementine_Hunter Hokusai Hokusai_old\tZdeněk_Sýkora\r\n" - ] - } - ], - "source": [ - "!ls data" - ] - }, - { - "cell_type": "markdown", - "id": "eff822f0", - "metadata": {}, - "source": [ - "### Display contents of directory for one artist" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "fc4d3d90", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "description.txt metadata.json\tworks works.txt\r\n" - ] - } - ], - "source": [ - "!ls ./data/Zdeněk_Sýkora" - ] - }, - { - "cell_type": "markdown", - "id": "e0921cb7", - "metadata": {}, - "source": [ - "### Description of artist" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "38079197", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Zdeněk Sýkora (February 3, 1920 – July 12, 2011) was a Czechoslovakian modern abstract painter and sculptor, and a pioneer of using computers in art." - ] - } - ], - "source": [ - "!cat ./data/Zdeněk_Sýkora/description.txt" - ] - }, - { - "cell_type": "markdown", - "id": "322e9c5b", - "metadata": {}, - "source": [ - "### Metadata of artist" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2e5ef192", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"family name\": \"Sýkora\", \"given name\": \"Zdeněk\", \"sex or gender\": \"male\", \"date of birth\": \"1920-02-03\", \"place of birth\": [\"Paceřice\", \"Louny\"], \"latitude of place of birth\": [\"50.619292049\", \"50.357078457\"], \"longitude of place of birth\": [\"15.113627963\", \"13.796762432\"], \"date of death\": \"2011-07-12\", \"place of death\": \"Louny\", \"latitude of place of death\": \"50.357078457\", \"longitude of place of death\": \"13.796762432\", \"country of citizenship\": [\"Czech Republic\", \"Czechoslovakia\"], \"residence\": \"\", \"work location\": \"Louny\", \"genre\": \"\", \"movement\": \"\", \"occupation\": [\"teacher\", \"architect\", \"painter\", \"sculptor\", \"graphic artist\"]}" - ] - } - ], - "source": [ - "!cat ./data/Zdeněk_Sýkora/metadata.json" - ] - }, - { - "cell_type": "markdown", - "id": "63251f32", - "metadata": {}, - "source": [ - "### Directory containing works of this artist" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "1cd0d995", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "black-and-white-structure-zdenek-sykora_6gGt7GEf2h9Yuw\r\n", - "black-lines-zdeněk-sýkora_NwEzX_kYKTNDOA\r\n", - "line-no-56-humberto-zdeněk-sýkora_XgGPSy53OVWyaw\r\n" - ] - } - ], - "source": [ - "!ls ./data/Zdeněk_Sýkora/works" - ] - }, - { - "cell_type": "markdown", - "id": "4c20d8c2", - "metadata": {}, - "source": [ - "### Directory containing one artwork by this artist" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "256919d3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork.png metadata.json\r\n" - ] - } - ], - "source": [ - "!ls ./data/Zdeněk_Sýkora/works/line-no-56-humberto-zdeněk-sýkora_XgGPSy53OVWyaw" - ] - }, - { - "cell_type": "markdown", - "id": "6829e0a2", - "metadata": {}, - "source": [ - "### Display metadata for this artwork" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "b5504ef7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"main_text\": \"The Fifty-sixth Line (in this simple way the artist has been naming his paintings since 1974) takes an important place in Sýkora´s work baceuse of its vigorous colour scheme. The first pictures on this theme were painted in 1973, when the preceding and no less exciting stage of his work was closed - the ten-year-long period of structures. As early as 1964, Zdeněk Sýkora, probably the first painter in the history of art, made use of the computer as an auxiliary tool in the solution of the composition of the picture. The programme for determining the position of each element in the structure was devised jointly with the mathematician Jaroslav Blažek. The structures were subject to strict, predetermined rules, the computer kept the rules and solved the assignment. On the other hand the \\\"earthworms\\\" - as the artist sometimes called his line pictures - are based on the application of chance in the system evolved by the artist. In 1985 his wife Lenka began to cooperate on its development. Thus each picture at firts existed as a numerical score, which was then transferred in classical way, by paint, on the canvas, as if a composition was created from the music notes. And why did the painter give the picture another name? He says that he himself was suprised by the way in which the \\\"crazy\\\" picure reminiscent of a circus had originated. The painting thus went beyond his expectations in all respects, he even had to get accustomed to the strange combinations of colours so that for one year Humberto was stored in the next room. In the end, it became a favourite representative of Sýkora´s art.\", \"title\": \"Line no. 56 (Humberto)\", \"creator\": \"Zdeněk Sýkora\", \"date\": \"1988\", \"physical dimensions\": \"200 x 200 cm\", \"medium\": \"acrylic, canvas\", \"id\": \"XgGPSy53OVWyaw\", \"link\": \"https://artsandculture.google.com/asset/line-no-56-humberto-zden%C4%9Bk-s%C3%BDkora/XgGPSy53OVWyaw\"}" - ] - } - ], - "source": [ - "!cat ./data/Zdeněk_Sýkora/works/line-no-56-humberto-zdeněk-sýkora_XgGPSy53OVWyaw/metadata.json" - ] - }, - { - "cell_type": "markdown", - "id": "2810ec59", - "metadata": {}, - "source": [ - "# Example artwork: Kajikazawa in Kai Province, from the series thirty-six views of Mount Fuji " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "c783a9ad", - "metadata": {}, - "outputs": [], - "source": [ - "artwork_links = ['https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg']" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "e38b6ff7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork URL: https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg\n" - ] - } - ], - "source": [ - "with GoogleArtScraper(output_dir + '/' + 'Hokusai' + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", - " # Go through each artwork link\n", - " for url in artwork_links:\n", - " print(f'artwork URL: {url}')\n", - " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" - ] - }, - { - "cell_type": "markdown", - "id": "8fb60484", - "metadata": {}, - "source": [ - "## Display results" - ] - }, - { - "cell_type": "markdown", - "id": "77f7868f", - "metadata": {}, - "source": [ - "### Directory containing works of this artist" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "5bd7654a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "冨嶽三十六景 甲州石班沢-kajikazawa-in-kai-province-kōshū-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanjūrokkei-katsushika-hokusai_hgHQaDeXBcllwg\r\n" - ] - } - ], - "source": [ - "!ls ./data/Hokusai/works" - ] - }, - { - "cell_type": "markdown", - "id": "2a521a4c", - "metadata": {}, - "source": [ - "### Directory containing one artwork by this artist" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "7a85d111", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork.png metadata.json\r\n" - ] - } - ], - "source": [ - "!ls ./data/Hokusai/works/冨嶽三十六景 甲州石班沢-kajikazawa-in-kai-province-kōshū-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanjūrokkei-katsushika-hokusai_hgHQaDeXBcllwg" - ] - }, - { - "cell_type": "markdown", - "id": "76b31701", - "metadata": {}, - "source": [ - "### Display metadata for this artwork" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "2e5d6e6d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"main_text\": \"\", \"title\": \"冨嶽三十六景 甲州石班沢|Kajikazawa in Kai Province (Kōshū Kajikazawa), from the series Thirty-six Views of Mount Fuji (Fugaku sanjūrokkei)\", \"creator\": \"Katsushika Hokusai\", \"date created\": \"ca. 1830–32\", \"physical dimensions\": \"10 1/4 x 15 1/8 in. (26 x 38.4 cm)\", \"type\": \"Woodblock print\", \"external link\": \" http://www.metmuseum.org/art/collection/search/39800\", \"medium\": \"Polychrome woodblock print; ink and color on paper\", \"repository\": \"Metropolitan Museum of Art, New York, NY\", \"period\": \"Edo period (1615–1868)\", \"culture\": \"Japan\", \"id\": \"hgHQaDeXBcllwg\", \"link\": \"https://artsandculture.google.com/asset/%E5%86%A8%E5%B6%BD%E4%B8%89%E5%8D%81%E5%85%AD%E6%99%AF%E3%80%80%E7%94%B2%E5%B7%9E%E7%9F%B3%E7%8F%AD%E6%B2%A2-kajikazawa-in-kai-province-k%C5%8Dsh%C5%AB-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanj%C5%ABrokkei-katsushika-hokusai/hgHQaDeXBcllwg\"}" - ] - } - ], - "source": [ - "!cat ./data/Hokusai/works/冨嶽三十六景 甲州石班沢-kajikazawa-in-kai-province-kōshū-kajikazawa-from-the-series-thirty-six-views-of-mount-fuji-fugaku-sanjūrokkei-katsushika-hokusai_hgHQaDeXBcllwg/metadata.json" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/example_wikipedia_issues_resolved.ipynb b/examples/example_wikipedia_issues_resolved.ipynb deleted file mode 100644 index 56ef45d..0000000 --- a/examples/example_wikipedia_issues_resolved.ipynb +++ /dev/null @@ -1,338 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d83d0691", - "metadata": {}, - "source": [ - "# Artist without a description, and artist with a non-English Wikipedia article " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "194ac6af", - "metadata": {}, - "outputs": [], - "source": [ - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8583c7c0", - "metadata": {}, - "outputs": [], - "source": [ - "from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c34d9016", - "metadata": {}, - "outputs": [], - "source": [ - "# Maximum number of attempts to perform a task \n", - "max_retries = 3" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c7c7aed1", - "metadata": {}, - "outputs": [], - "source": [ - "min_wait_time = 10" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "43ae9afa", - "metadata": {}, - "outputs": [], - "source": [ - "# Directory in which the data is to be stored\n", - "output_dir = './data'" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "8d21abe7", - "metadata": {}, - "outputs": [], - "source": [ - "# Artist urls\n", - "artist_urls = ['https://artsandculture.google.com/entity/esther-teichmann/g113vf7r7v',\n", - " 'https://artsandculture.google.com/entity/cornelis-albertus-johannes-schermer/g11bw5_6rgd']" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "eb623d6b", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/3gEvP4cE7Pyrag\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/3gGO_pWJeaxCQg\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/5wGcqEv2h2Cv6Q\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/3gEvP4cE7Pyrag\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/QgGLOdNbTYmr7g\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/dQEUb6SPOTxllQ\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/vgHV_6ie8P_Pcw\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/ygGTujVCOMXcWA\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/IwEn4wyeGt_tHA\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/fwGoPLZSHtLn2A\n", - "artwork URL: https://artsandculture.google.com/asset/installation-view-esther-teichmann-heavy-the-sea-esther-teichmann/zAE5-HfupzhWJg\n", - "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/cwEOXoCnybuhYA\n", - "artwork URL: https://artsandculture.google.com/asset/ruiter-in-uniform-van-achter-gezien-schermer-cornelis-albertus-johannes/cwG4U2LYgzpmkQ\n", - "artwork URL: https://artsandculture.google.com/asset/landschap-met-cavaleristen-schermer-cornelis-albertus-johannes/YwG0PJ20ZMTI-Q\n", - "artwork URL: https://artsandculture.google.com/asset/ruiter-in-uniform-van-achteren-gezien-schermer-cornelis-albertus-johannes/0wGa8wyBsKqwIg\n", - "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/mAHxJYt_tYhFqw\n", - "artwork URL: https://artsandculture.google.com/asset/landschap-met-cavaleristen-schermer-cornelis-albertus-johannes/KwFJL6DQpYzK1w\n", - "artwork URL: https://artsandculture.google.com/asset/cavaleriepaard-en-een-lancier-schermer-cornelis-albertus-johannes/FQGMgcNN9og85g\n", - "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/cwEOXoCnybuhYA\n", - "artwork URL: https://artsandculture.google.com/asset/weiland-met-een-paard-en-twee-koeien-schermer-cornelis-albertus-johannes/NwHUHRjHrk-tWw\n", - "artwork URL: https://artsandculture.google.com/asset/ruiters-met-paarden-schermer-cornelis-albertus-johannes/3wHC3D3PLFXz9w\n", - "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/7QGRGjdJFlusww\n", - "artwork URL: https://artsandculture.google.com/asset/landschap-met-cavaleristen-schermer-cornelis-albertus-johannes/oAFFsCDCMIg2sw\n", - "artwork URL: https://artsandculture.google.com/asset/paard-krijgt-hoefijzers-in-een-smederij-schermer-cornelis-albertus-johannes/cQH-x8lIGO0IPQ\n", - "CPU times: user 102 ms, sys: 66.8 ms, total: 169 ms\n", - "Wall time: 16.9 s\n" - ] - } - ], - "source": [ - "%%time \n", - "\n", - "# Find_artworks for each artist\n", - "for artist_url in artist_urls:\n", - " with FindArtworks(artist_link=artist_url, output_dir=output_dir, \n", - " min_wait_time=min_wait_time) as scraper:\n", - " # Save list of works, description, and metadata for an artist\n", - " retry(scraper.save_artist_information, max_retries, min_wait_time)\n", - " # Create directory for this artist\n", - " artist_dir = output_dir + '/' + scraper.get_artist_name() \n", - " # Get list of links to this artist's works \n", - " with open(artist_dir+'/'+'works.txt', 'r') as file:\n", - " artwork_links = [line.rstrip() for line in file] \n", - " # Scrape artworks\n", - " with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=min_wait_time) as subscraper:\n", - " # Go through each artwork link\n", - " for url in artwork_links:\n", - " print(f'artwork URL: {url}')\n", - " retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)" - ] - }, - { - "cell_type": "markdown", - "id": "3519fb2e", - "metadata": {}, - "source": [ - "## Display results" - ] - }, - { - "cell_type": "markdown", - "id": "f8cfeece", - "metadata": {}, - "source": [ - "### Display contents of data directory" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9f56346a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cornelis_Albertus_Johannes_Schermer Esther_Teichmann\r\n" - ] - } - ], - "source": [ - "!ls data" - ] - }, - { - "cell_type": "markdown", - "id": "eff822f0", - "metadata": {}, - "source": [ - "### Display contents of directory for artist Esther Teichmann" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "fc4d3d90", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "description.txt metadata.json\tworks works.txt\r\n" - ] - } - ], - "source": [ - "!ls ./data/Esther_Teichmann" - ] - }, - { - "cell_type": "markdown", - "id": "e0921cb7", - "metadata": {}, - "source": [ - "#### Description of artist" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "38079197", - "metadata": {}, - "outputs": [], - "source": [ - "!cat ./data/Esther_Teichmann/description.txt" - ] - }, - { - "cell_type": "markdown", - "id": "322e9c5b", - "metadata": {}, - "source": [ - "#### Metadata of artist" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2e5ef192", - "metadata": {}, - "outputs": [], - "source": [ - "!cat ./data/Esther_Teichmann/metadata.json" - ] - }, - { - "cell_type": "markdown", - "id": "163c0111", - "metadata": {}, - "source": [ - "### Display contents of directory for artist Cornelis Albertus Johannes Schermer" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "722d2e48", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "description.txt metadata.json\tworks works.txt\r\n" - ] - } - ], - "source": [ - "!ls ./data/Cornelis_Albertus_Johannes_Schermer" - ] - }, - { - "cell_type": "markdown", - "id": "719e56e1", - "metadata": {}, - "source": [ - "#### Description of artist" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1e0fb399", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cornelis Albertus Johannes Schermer (* 12. Juni 1824 in Den Haag; † 4. Januar 1915 ebenda) war ein niederländischer Pferdemaler und Radierer.\r\n", - "Schermer war von 1841 bis 1844 Student der Koninklijke Academie van Beeldende Kunsten in Den Haag unter der Leitung von Cornelis Kruseman und Jacobus Everhardus Josephus van den Berg, Er wurde auch von Joseph Moerenhout (1801–1874) beraten.\r\n", - "1875 belebte er ein Vedutengemälde von Carel Jacobus Behr mit Figuren. \r\n", - "Schermer war in Den Haag und von 1880 bis 1903 in Bouvignies bei Dinant tätig.\r\n", - "Vön 1846 bis 1903 zeigte er seine Werke auf den Ausstellungen in Amsterdam. Den Haag und Rotterdam, signierte seine Werke mit „C. Schermer“." - ] - } - ], - "source": [ - "!cat ./data/Cornelis_Albertus_Johannes_Schermer/description.txt" - ] - }, - { - "cell_type": "markdown", - "id": "93fad724", - "metadata": {}, - "source": [ - "#### Metadata of artist" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "a4c5595a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"family name\": \"\", \"given name\": \"Cornelis\", \"pseudonym\": \"\", \"sex or gender\": \"male\", \"date of birth\": \"1824-06-12\", \"place of birth\": \"The Hague\", \"latitude of place of birth\": \"52.08\", \"longitude of place of birth\": \"4.31\", \"date of death\": \"1915-01-04\", \"place of death\": \"The Hague\", \"latitude of place of death\": \"52.08\", \"longitude of place of death\": \"4.31\", \"country of citizenship\": \"Kingdom of the Netherlands\", \"residence\": \"\", \"work location\": \"\", \"genre\": \"\", \"movement\": \"\", \"occupation\": \"painter\"}" - ] - } - ], - "source": [ - "!cat ./data/Cornelis_Albertus_Johannes_Schermer/metadata.json" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}