-
Notifications
You must be signed in to change notification settings - Fork 0
/
pictures.py
146 lines (125 loc) · 4.88 KB
/
pictures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 22 14:28:00 2021
@author: rimes
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
import requests, os, bs4
from tqdm import tqdm
from urllib.parse import urljoin, urlparse
from selenium.webdriver.common.action_chains import ActionChains
import time
from tkinter import Tk
from InputWindow import UserInputWindow
"Code gotten from https://www.thepythoncode.com/article/download-web-page-images-python"
def getUserInput(title, prompt):
    """
    Shows a small Tk window asking the user for one value and returns it.

    `title` and `prompt` are handed unchanged to `UserInputWindow`; the
    return value is whatever that window's `getValue()` reports after the
    Tk main loop has ended.
    """
    window = Tk()
    # Tk geometry string: 250x150 pixels, offset 300px from the left and top
    window.geometry("250x150+300+300")
    dialog = UserInputWindow(title, prompt)
    # blocks until the main loop is ended
    window.mainloop()
    answer = dialog.getValue()
    return answer
def is_valid(url):
    """
    Tells whether `url` is an absolute URL, i.e. one that carries both a
    scheme and a network location.
    """
    pieces = urlparse(url)
    # no network location (e.g. a bare relative path) -> not usable
    if not pieces.netloc:
        return False
    # a host without a scheme such as "https" is not usable either
    return bool(pieces.scheme)
def get_img(url):
    """
    Finds the URL of the first image on the page at `url`.

    The page is fetched with `requests`, parsed with BeautifulSoup, and the
    `src` of the first `<img>` tag is resolved against `url`. Any query
    string ("?key=value" pairs) is dropped from the result.

    Returns the absolute image URL, or None when the page has no `<img>`
    tag, the tag has no `src`, or the resolved address is not a valid URL.

    Raises requests.HTTPError when the page request returns an error status.
    """
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.content, 'html.parser')
    img = soup.find("img")
    # find() gives None when the page has no <img> at all; report "no image"
    # instead of crashing on the attribute lookup below
    if img is None:
        return None
    img_url = img.attrs.get("src")
    # if we can't find its src then we don't get the image
    if not img_url:
        return None
    # merges with the base url so relative srcs become absolute
    img_url = urljoin(url, img_url)
    # gets rid of the "HTTP GET" key value pairs (everything from the first "?")
    img_url = img_url.partition("?")[0]
    # make sure it is a valid url
    if is_valid(img_url):
        return img_url
    return None
def download(url, pathname):
    """
    Downloads a file given an URL and puts it in the folder `pathname`.

    The folder is created if it does not exist. The file is named after the
    last "/"-separated segment of `url` and is streamed to disk in 1024-byte
    chunks while a byte-based progress bar is shown.

    Raises requests.HTTPError when the server answers with an error status,
    so an error page is never saved in place of the file.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test
    os.makedirs(pathname, exist_ok=True)
    # get the file name
    filename = os.path.join(pathname, url.split("/")[-1])
    # stream=True downloads the body by chunk, not all at once; the `with`
    # releases the connection even if writing fails
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # get the total file size (0 when the server does not report one)
        file_size = int(response.headers.get("Content-Length", 0))
        # The bar is driven only by the manual update() below. Wrapping the
        # chunk iterator in tqdm as well would add 1 per chunk on top of the
        # byte count.
        with open(filename, "wb") as f, tqdm(
            desc=f"Downloading {filename}",
            total=file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
        ) as progress:
            for data in response.iter_content(1024):
                # write data read to the file
                f.write(data)
                # advance the bar by the number of bytes written
                progress.update(len(data))
def searchImages(browser, SearchTerm):
    """
    Types `SearchTerm` into the site's search box and submits it.

    Clicks the element with class "icon-search", waits up to 10 seconds for
    the "search-field" input to become clickable, then sends the term
    followed by ENTER. If the wait times out the page is refreshed and the
    whole sequence is tried again; there is no retry limit.
    """
    while True:
        try:
            # find_element(By...) replaces the find_element_by_* helpers,
            # which were removed in Selenium 4.3
            searchIcon = browser.find_element(By.CLASS_NAME, "icon-search")
            searchIcon.click()
            search = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable((By.ID, "search-field"))
            )
            search.send_keys(SearchTerm)
            search.send_keys(Keys.ENTER)
            return
        except TimeoutException:
            # the search field never became clickable; reload and retry
            browser.refresh()
def getToFirstImg(browser):
    """
    Opens the first photo of the search results shown in `browser`.

    Waits up to 10 seconds for a result tile to become clickable, scrolls
    the window to y=50, clicks the first "photo-list-photo-interaction"
    element, then waits up to 10 seconds for a ".hide-text" element to be
    present on the page that opens.

    Raises TimeoutException if either wait expires.
    """
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, ".view.photo-list-photo-view.awake"))
    )
    # find_element(By...) replaces find_element_by_xpath, removed in Selenium 4.3
    firstImg = browser.find_element(
        By.XPATH, "(//div[@class = 'photo-list-photo-interaction'])[1]"
    )
    action = ActionChains(browser)
    browser.execute_script("window.scrollTo(0, 50)")
    action.move_to_element(firstImg).click().perform()
    # presumably ".hide-text" marks the photo page's navigation controls;
    # its presence is used as the "page has loaded" signal
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.hide-text'))
    )
def getXnumImg(browser, numPics):
    """
    Collects the image URLs of `numPics` consecutive photo pages.

    Starting from the photo page currently open in `browser`, each step
    looks for a "Sponsored Advertiser" link for up to 3 seconds. If one is
    found the page is treated as an advert: the function sleeps 8 seconds
    and moves on without recording anything. Otherwise the page's image URL
    is looked up with `get_img` and, when one is found, appended to the
    result. The "next" control is then clicked.

    Returns the list of collected image URLs (empty when `numPics` <= 0).
    """
    urls = []
    # the bar is sized to the requested count and closed when the loop ends
    with tqdm(total=numPics, desc="Downloading images...") as pbar:
        i = 0
        while i < numPics:
            try:
                WebDriverWait(browser, 3).until(
                    EC.presence_of_element_located((By.LINK_TEXT, "Sponsored Advertiser"))
                )
                # advert page: wait, then skip it without counting it
                time.sleep(8)
            except TimeoutException:
                # no advert marker showed up, so this is a regular photo page
                imgURL = get_img(browser.current_url)
                if imgURL is not None:
                    urls.append(imgURL)
                    pbar.update(1)
                    i += 1
            # Until two images have been collected the first "hide-text"
            # element is clicked; after that the second one is. Presumably
            # the first becomes a "previous" arrow once we have moved
            # forward -- confirm against the live page.
            if i < 2:
                nextImg = browser.find_element(By.CLASS_NAME, "hide-text")
            else:
                nextImg = browser.find_element(By.XPATH, "(//span[@class = 'hide-text'])[2]")
            nextImg.click()
    return urls
if __name__ == "__main__":
    # Script entry point: ask for a search term and a count, collect that
    # many image URLs from Flickr's search results, then download them into
    # the "pictures" folder.
    SearchTerm = getUserInput("Search Word", "Please enter what you want to search:")
    numOfPictures = getUserInput("Number of Pictures", "Please enter the number of pictures you want:")
    # convert up front so a non-numeric answer fails before Chrome is launched
    pictureCount = int(numOfPictures)
    browser = webdriver.Chrome()
    try:
        browser.get('https://flickr.com/')
        searchImages(browser, SearchTerm)
        getToFirstImg(browser)
        urls = getXnumImg(browser, pictureCount)
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window) and runs even when scraping raised
        browser.quit()
    for link in urls:
        download(link, "pictures")