Skip to content

Commit

Permalink
Fixed filename too long and empty image file errors, removed redundan…
Browse files Browse the repository at this point in the history
…t example notebooks (#29)

* fix filename-too-long issue

* check for empty artwork file

* remove redundant notebooks

* add SHA1 hash to directory names for long urls

* change filename length limit for truncation
  • Loading branch information
modhurita authored Nov 7, 2023
1 parent 05ccad3 commit fc13ed6
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 1,385 deletions.
45 changes: 38 additions & 7 deletions artscraper/googleart.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from urllib.parse import urlparse
from urllib.parse import unquote

import hashlib

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
Expand Down Expand Up @@ -45,27 +47,49 @@ def load_link(self, link):
if link == self.link:
return False
self.link = link

if self.output_dir is not None:
if (self.paint_dir.is_dir() and self.skip_existing
and Path(self.paint_dir, "metadata.json").is_file()
and Path(self.paint_dir, "artwork.png").is_file()):
and Path(self.paint_dir, "artwork.png").is_file()
and Path(self.paint_dir, "artwork.png").stat().st_size>0):
return False
self.paint_dir.mkdir(exist_ok=True, parents=True)

self.wait(self.min_wait)
self.driver.get(link)
return True

@property
def paint_dir(self):
    """Directory in which the files for the current artwork are stored.

    The directory name is built from the last two components of the path
    of ``self.link``, joined by an underscore and percent-decoded. When
    the UTF-8 encoding of that name is ``max_byte_length`` bytes or
    longer, it is replaced by a truncated prefix followed by ``_`` and
    the SHA-1 hex digest of the *full* (untruncated) name, so that
    different long URLs still map to different directories.

    Returns:
        Path: ``self.output_dir`` joined with the directory name.
    """
    paint_id = "_".join(urlparse(self.link).path.split("/")[-2:])

    # Prevent problems with character encoding/decoding
    paint_id = unquote(paint_id)

    # Length limits are enforced in bytes rather than characters, because
    # non-ASCII characters occupy several bytes each once encoded.
    paint_id_encoded = paint_id.encode("utf-8")

    # Prevent problems with too-long directory names. Common filesystems
    # cap a single path component at about 255 bytes/characters, so stay
    # below that, leaving room for the hash suffix.
    # Maximum length of the part of the directory name derived from the
    # Google Arts & Culture url for the artwork.
    max_byte_length = 240
    # Number of characters in a SHA-1 hex digest.
    hash_length = 40

    if len(paint_id_encoded) >= max_byte_length:
        # The input is valid UTF-8, so the only bytes that can fail to
        # decode are those of a character cut in half at the very end of
        # the slice; errors="ignore" drops exactly those bytes.
        prefix = paint_id_encoded[:max_byte_length - hash_length].decode(
            "utf-8", errors="ignore")
        # Hash the full name (not the truncated one) so that names sharing
        # a long common prefix do not collide.
        digest = hashlib.sha1(paint_id_encoded).hexdigest()
        paint_id = prefix + "_" + digest

    return Path(self.output_dir, paint_id)

Expand Down Expand Up @@ -137,14 +161,19 @@ def get_image(self):
self.wait(self.min_wait)
elem = self.driver.find_element(
"xpath", "/html/body/div[3]/div[3]/div/div/div[2]/div[3]")

webdriver.ActionChains(
self.driver).move_to_element(elem).click(elem).perform()

self.wait(self.min_wait * 2, update=False)
elem = self.driver.find_element(
"xpath", "/html/body/div[3]/div[3]/div/div/div[2]/div[3]")

img = elem.screenshot_as_png

self.wait(self.min_wait)
self.driver.find_element("xpath", "/html/body").send_keys(Keys.ESCAPE)

return img

def save_image(self, img_fp=None, link=None):
Expand All @@ -164,11 +193,13 @@ def save_image(self, img_fp=None, link=None):

img_fp = self._convert_img_fp(img_fp, suffix=".png")

if self.skip_existing and img_fp.is_file():
if self.skip_existing and img_fp.is_file() and img_fp.stat().st_size!=0:
return

with open(img_fp, "wb") as f:
f.write(self.get_image())


def save_artwork_information(self, link):
"""
Given an artwork link, saves the image and the associated metadata.
Expand Down
125 changes: 49 additions & 76 deletions examples/example_collect_all_artworks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,12 @@
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "77cd3e7d",
"cell_type": "raw",
"id": "3718c2c0",
"metadata": {},
"outputs": [],
"source": [
"# Get links for all artists, as a list\n",
"#artist_urls = get_artist_links(executable_path='geckodriver', min_wait_time=1, output_file='artist_links.txt')"
"artist_urls = get_artist_links(executable_path='geckodriver', min_wait_time=1, output_file='artist_links.txt')"
]
},
{
Expand All @@ -37,29 +35,38 @@
"# Collect artworks and metadata for all artists"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "dadfbb4d",
"metadata": {},
"outputs": [],
"source": [
"from artscraper import GoogleArtScraper, FindArtworks, random_wait_time, retry"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8583c7c0",
"id": "c3a16846",
"metadata": {},
"outputs": [],
"source": [
"from artscraper import GoogleArtScraper, FindArtworks"
"# Sample artist link, for illustration purposes\n",
"artist_urls = ['https://artsandculture.google.com/entity/jan-van-der-heyden/m05g5_1']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c3a16846",
"id": "bba4ec54",
"metadata": {},
"outputs": [],
"source": [
"# Small subset of artist links, for illustration purposes\n",
"# (3 artists, each with < 5 artworks)\n",
"artist_urls = ['https://artsandculture.google.com/entity/william-notman/m04mpzj',\n",
" 'https://artsandculture.google.com/entity/alexander-keirincx/m03cxjmm',\n",
" 'https://artsandculture.google.com/entity/abraham-lambertsz-van-den-tempel/m09g78pg'\n",
" ]"
"# Maximum number of attempts to perform a task \n",
"max_retries = 3\n",
"# Minimum time (in seconds) to wait before retrying\n",
"min_wait_time = 10"
]
},
{
Expand All @@ -75,38 +82,30 @@
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2be1700c",
"execution_count": null,
"id": "09a90d85",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 389 ms, sys: 37.9 ms, total: 427 ms\n",
"Wall time: 10min 18s\n"
]
}
],
"outputs": [],
"source": [
"%%time \n",
"\n",
"# Find_artworks for each artist\n",
"for artist_url in artist_urls:\n",
" with FindArtworks(artist_link=artist_url, output_dir=output_dir, min_wait_time=10) as scraper:\n",
" with FindArtworks(artist_link=artist_url, output_dir=output_dir, \n",
" min_wait_time=min_wait_time) as scraper:\n",
" # Save list of works, description, and metadata for an artist\n",
" scraper.save_artist_information()\n",
" # Get list of links to this artist's works \n",
" artwork_links = scraper.get_artist_works()\n",
" retry(scraper.save_artist_information, max_retries, min_wait_time)\n",
" # Create directory for this artist\n",
" artist_dir = output_dir + '/' + scraper.get_wikipedia_article_title() \n",
" artist_dir = output_dir + '/' + scraper.get_artist_name() \n",
" # Get list of links to this artist's works \n",
" with open(artist_dir+'/'+'works.txt', 'r') as file:\n",
" artwork_links = [line.rstrip() for line in file] \n",
" # Scrape artworks\n",
" with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=10) as subscraper:\n",
" with GoogleArtScraper(artist_dir + '/' + 'works', min_wait=min_wait_time) as subscraper:\n",
" # Go through each artwork link\n",
" for url in artwork_links:\n",
" subscraper.load_link(url)\n",
" subscraper.save_metadata()\n",
" subscraper.save_image()"
" print(f'artwork URL: {url}')\n",
" retry(subscraper.save_artwork_information, max_retries, min_wait_time, url)"
]
},
{
Expand Down Expand Up @@ -135,7 +134,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Abraham_Lambertsz_van_den_Tempel Alexander_Keirincx William_Notman\r\n"
"Jan_Van_Der_Heyden Juana_Alicia\r\n"
]
}
],
Expand All @@ -161,12 +160,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"description.txt metadata.txt works works.txt\r\n"
"description.txt metadata.json\tworks works.txt\r\n"
]
}
],
"source": [
"!ls ./data/William_Notman/"
"!ls ./data/Jan_Van_Der_Heyden/"
]
},
{
Expand All @@ -187,12 +186,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"William Notman (8 March 1826 – 25 November 1891) was a Scottish-Canadian photographer and businessman. The Notman House in Montreal was his home from 1876 until his death in 1891, and it has since been named after him. Notman was the first photographer in Canada to achieve international recognition."
"Jan van der Heyden (5 March 1637, Gorinchem – 28 March 1712, Amsterdam) was a Dutch Baroque-era painter, glass painter, draughtsman and printmaker. Van der Heyden was one of the first Dutch painters to specialize in townscapes and became one of the leading architectural painters of the Dutch Golden Age. He painted a number of still lifes in the beginning and at the end of his career.Jan van der Heyden was also an engineer and inventor who made significant contributions to contemporary firefighting technology. Together with his brother Nicolaes, who was a hydraulic engineer, he invented an improvement of the fire hose in 1672. He modified the manual fire engine, reorganised the volunteer fire brigade (1685) and wrote and illustrated the first firefighting manual (Brandspuiten-boek). A comprehensive street lighting scheme for Amsterdam, designed and implemented by van der Heyden, remained in operation from 1669 until 1840 and was adopted as a model by many other towns and abroad."
]
}
],
"source": [
"!cat ./data/William_Notman/description.txt"
"!cat ./data/Jan_Van_Der_Heyden/description.txt"
]
},
{
Expand All @@ -213,27 +212,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"family name : Notman\r\n",
"given name : William\r\n",
"sex or gender : male\r\n",
"date of birth : 1826-03-08\r\n",
"place of birth : Paisley\r\n",
"latitude of place of birth : 55.845555555\r\n",
"longitude of place of birth : -4.423888888\r\n",
"date of death : 1826-03-08\r\n",
"place of death : Montreal\r\n",
"latitude of place of death : 45.508888888\r\n",
"longitude of place of death : -73.561666666\r\n",
"country of citizenship : ['Canada', 'United Kingdom of Great Britain and Ireland']\r\n",
"residence : Canada\r\n",
"work location : ['New York City', 'Boston', 'Toronto', 'Montreal', 'Philadelphia', 'Ottawa', 'Halifax', 'Niagara Falls', 'Cambridge']\r\n",
"genre : \r\n",
"movement : \r\n"
"{\"family name\": \"Van der Heyden\", \"given name\": \"Jan\", \"pseudonym\": \"\", \"sex or gender\": \"male\", \"date of birth\": \"1637-03-05\", \"place of birth\": \"Gorinchem\", \"latitude of place of birth\": \"51.83652\", \"longitude of place of birth\": \"4.97243\", \"date of death\": \"1712-03-28\", \"place of death\": \"Amsterdam\", \"latitude of place of death\": \"52.372777777\", \"longitude of place of death\": \"4.893611111\", \"country of citizenship\": \"Netherlands\", \"residence\": \"\", \"work location\": \"Amsterdam\", \"genre\": \"landscape art\", \"movement\": \"\", \"occupation\": [\"firefighter\", \"inventor\", \"painter\", \"instrument maker\", \"printmaker\"]}"
]
}
],
"source": [
"!cat ./data/William_Notman/metadata.txt"
"!cat ./data/Jan_Van_Der_Heyden/metadata.json"
]
},
{
Expand All @@ -246,23 +230,12 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "54afc420",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bald-eagle-white-headed-eagle-william-notman_oQGugt5O8az3bA\r\n",
"montreal-from-street-railway-power-house-chimney-qc__AHTyBmsOEhjaw\r\n",
"mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc_GQEUtJuLAhf54w\r\n",
"the-terra-nova-snowshoe-club-montreal-notman-william-sandham-henry_OQFIdhZoZj9eOg\r\n"
]
}
],
"outputs": [],
"source": [
"!ls ./data/William_Notman/works"
"!ls ./data/Jan_Van_Der_Heyden/works"
]
},
{
Expand All @@ -288,7 +261,7 @@
}
],
"source": [
"!ls ./data/William_Notman/works/mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc_GQEUtJuLAhf54w/"
"!ls ./data/Jan_Van_Der_Heyden/works/country-house-on-the-vliet-near-delft-jan-van-de-heyden_3wEgj7D5Ld8nvg"
]
},
{
Expand All @@ -313,11 +286,11 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8f4e1e93",
"id": "046657bf",
"metadata": {},
"outputs": [],
"source": [
"img = mpimg.imread(\"./data/William_Notman/works/mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc_GQEUtJuLAhf54w/artwork.png\")\n",
"img = mpimg.imread(\"./data/Jan_Van_Der_Heyden/works/country-house-on-the-vliet-near-delft-jan-van-de-heyden_3wEgj7D5Ld8nvg/artwork.png\")\n",
"plt.imshow(img)\n",
"plt.show()"
]
Expand All @@ -340,12 +313,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{\"main_text\": \"\", \"title\": \"Mrs. William MacKenzie in Allan's conservatory, Montreal, QC\", \"date\": \"1871/1871\", \"type\": \"Photographie, Photograph\", \"titre\": \"Mme William MacKenzie, Montr\\u00e9al, QC\", \"photographer\": \"William Notman\", \"credit line\": \"Purchase from Associated Screen News Ltd., Achat de l'Associated Screen News Ltd.\", \"rights\": \"http://www.musee-mccord.qc.ca/en/orders/conditions/\", \"external link\": \" http://www.mccord-museum.qc.ca/en/collection/artifacts/I-63833\", \"medium\": \"Silver salts on glass - Wet collodion process, Plaque de verre au collodion humide\", \"id\": \"GQEUtJuLAhf54w\", \"link\": \"https://artsandculture.google.com/asset/mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc/GQEUtJuLAhf54w\"}"
"{\"main_text\": \"The country house in the right middle ground has been identified as one which used to lie on the river Vliet, running between Delft and The Hague. Though this is possible, the house does not seem sufficiently distinctive to permit such a specific identification. This scene, however, depicts a fashionable part of Holland in the seventeenth century: a navigable canal or river with a well-kept towpath and a considerable volume of freight traffic. Lining the water are houses with plots of land extending into the flat, low-lying, fertile, reclaimed land. There is an alternation of elegant farmhouses, like the one with a stepped gable and hayrick, and buitenplaatsen (country houses), like the one nearer to us, with its ionic pilasters and dormer windows with scroll surrounds (as opposed to the more traditional gables). This house has a stone gate and a topiary hedge with claire-vues and an avenue of trees. Audrey Lambert reproduces a 1770 map of Rijswijk, between Delft and The Hague, which still shows exactly this alternation of simple plots and formal gardens extending into the polders on either side of the Vliet and nearby roads. This image by Heyden (1637-1712) is notable for its restrained depiction of evening light, with more white than gold in the spectrum and just a hint of pink in some of the clouds. But it is the vivid naturalism of the scene, with its matter-of-fact viewpoint, recording a public thoroughfare with no deference to the country house, which so remarkably anticipates the landscapes of the Impressionists. 
It is also possible that Constable had seen this painting when he painted his Scene on a Navigable River in 1816-17 (Tate, London), with its sparkling pointillist touch and scrupulous record of a working inland waterway.\", \"title\": \"Country House on the Vliet near Delft\", \"creator\": \"Jan van de Heyden\", \"creator lifespan\": \"1637 - 1712\", \"date created\": \"1665\", \"type\": \"Painting\", \"rights\": \"Supplied by Royal Collection Trust / (c) HM Queen Elizabeth II 2012\", \"external link\": \" http://www.rct.uk/collection/405948\", \"medium\": \"Oil on panel\", \"provenance\": \"Acquired by George IV when Prince Regent, 1814\", \"object description\": \"Beside a canal runs a road on which a huntsman walks his dog, with a country house & an outbuilding on the right; a mother and her children are seated by the road; in the centre a barge is moored to a landing-stage.\", \"id\": \"3wEgj7D5Ld8nvg\", \"link\": \"https://artsandculture.google.com/asset/country-house-on-the-vliet-near-delft-jan-van-de-heyden/3wEgj7D5Ld8nvg\"}"
]
}
],
"source": [
"!cat ./data/William_Notman/works/mrs-william-mackenzie-in-allan-s-conservatory-montreal-qc_GQEUtJuLAhf54w//metadata.json"
"!cat ./data/Jan_Van_Der_Heyden/works/country-house-on-the-vliet-near-delft-jan-van-de-heyden_3wEgj7D5Ld8nvg/metadata.json"
]
}
],
Expand Down
Loading

0 comments on commit fc13ed6

Please sign in to comment.