-
Notifications
You must be signed in to change notification settings - Fork 0
/
booksToScrape.py
49 lines (37 loc) · 1.32 KB
/
booksToScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""This module scrapes all the http://books.toscrape.com website."""
import requests
from bs4 import BeautifulSoup
import os.path
import scrapeCategory
import sys
def main():
"""Scrapes the website homepage.
Get the path set by the user where the results will be saved.
Parse the homepage and retrieves links of all categories.
Calls the browse_pages function of the scrapCategory module.
"""
path_user_arg = sys.argv
csv_folder = os.getcwd() + '/csv'
if len(path_user_arg) == 2:
if os.path.isdir(path_user_arg[1]):
csv_folder = path_user_arg[1] + '/csv'
if not os.path.exists(csv_folder):
os.makedirs(csv_folder)
else:
print('Please enter a valid path folder')
exit()
else:
print('Please enter a path folder')
exit()
url = 'http://books.toscrape.com/'
response = requests.get(url)
if response.ok:
soup_ctg = BeautifulSoup(response.text, 'lxml')
menu_ctg = soup_ctg.findAll('ul', {'class': 'nav nav-list'})
categories = menu_ctg[0].find('ul').findAll('li')
for category in categories:
a = category.find('a')
link = url + a['href']
scrapeCategory.browse_pages(link[0:-10], csv_folder, url)
if __name__ == '__main__':
main()