Scraper.py
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, ConnectionError, Timeout, RequestException

# ANSI escape codes for colored terminal output
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
RESET = "\033[0m"

class WebScraper:
    """Crawls a site up to a given depth and collects downloadable file links."""

    def __init__(self, url, depth, extensions=None):
        self.url = url
        self.depth = depth
        self.extensions = extensions if extensions else []
        self.visited_urls = set()      # pages already crawled
        self.downloaded_files = set()  # file URLs discovered so far

    def is_valid_url(self, url):
        # Stay on the starting host and skip pages that were already visited.
        return (
            urlparse(url).netloc == urlparse(self.url).netloc
            and url not in self.visited_urls
        )
    def get_page_content(self, url):
        try:
            # The request timeout is an addition here; without one, the
            # Timeout handler below would effectively never fire.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.content
        except HTTPError as e:
            print(f"{RED}HTTP error occurred: {e}{RESET}")
        except ConnectionError:
            print(f"{RED}Connection error occurred: can't connect to {url}{RESET}")
        except Timeout as e:
            print(f"{RED}Timeout error occurred: {e}{RESET}")
        except RequestException as e:
            print(f"{RED}Unexpected error occurred: {e}{RESET}")
        return None
    def download_files(self, urls, download_dir="results"):
        # Files are saved under results/<hostname>/.
        download_dir = os.path.join(download_dir, urlparse(self.url).netloc)
        os.makedirs(download_dir, exist_ok=True)
        for url in urls:
            filename = os.path.basename(urlparse(url).path)
            file_path = os.path.join(download_dir, filename)
            try:
                with requests.get(url, stream=True) as response:
                    response.raise_for_status()
                    with open(file_path, "wb") as file:
                        for chunk in response.iter_content(chunk_size=8192):
                            file.write(chunk)
                print(f"{GREEN}Downloaded file: {url}{RESET}")
            except RequestException as e:
                # Catch all requests errors (not only HTTPError) so one failed
                # download does not abort the rest of the batch.
                print(f"{RED}Download failed: {e}{RESET}")
    def extract_files(self, soup, url):
        links_attributes = ["href", "src"]
        for tag in soup.find_all(True, href=True) + soup.find_all(True, src=True):
            for attribute in links_attributes:
                if tag.has_attr(attribute):
                    link = tag[attribute]
                    parsed_link = urlparse(link)
                    if "." in os.path.basename(parsed_link.path):
                        if not self.extensions or any(
                            link.endswith(ext) for ext in self.extensions
                        ):
                            file_url = urljoin(url, link)
                            if file_url not in self.downloaded_files:
                                self.downloaded_files.add(file_url)

    def crawl_to_links(self, soup, url, current_depth):
        for tag in soup.find_all("a", href=True):
            link = urljoin(url, tag["href"])
            if self.is_valid_url(link):
                self.scrape_page(link, current_depth + 1)

    def scrape_page(self, url, current_depth):
        if current_depth > self.depth or not self.is_valid_url(url):
            return
        self.visited_urls.add(url)
        content = self.get_page_content(url)
        if content is None:
            return
        soup = BeautifulSoup(content, "html.parser")
        self.extract_files(soup, url)
        self.crawl_to_links(soup, url, current_depth)
    def start_scraping(self):
        self.scrape_page(self.url, current_depth=1)
        if len(self.downloaded_files) > 0:
            print(f"{YELLOW}Discovered {len(self.downloaded_files)} files.{RESET}")
            user_res = input("Do you want to download them? (y/n): ").strip().lower()
            if user_res == "y":
                self.download_files(self.downloaded_files)
            else:
                print(f"{RED}Download aborted.{RESET}")
        else:
            print(f"{YELLOW}No files found.{RESET}")