web_scraper.py
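
"""Scrapes Google search results with Playwright, saves each linked page's HTML, and collects image URLs into image_urls.json."""
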
import os
import hashlib
import json
import logging
import re
import urllib.parse
from typing import List, Optional, Tuple

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright


def scrape_google_and_links(query: str, max_links: int = 5) -> List[str]:
    """
    Scrapes Google search results and linked pages for a given query. Saves image URLs.

    Args:
        query (str): The search query to use.
        max_links (int): Maximum number of links to scrape. Defaults to 5.

    Returns:
        list: A list of scraped content from the links.
    """
    image_urls = []
    scraped_contents = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            # Perform Google search
            encoded_query = urllib.parse.quote(query)
            google_url = f"https://www.google.com/search?q={encoded_query}"
            page.goto(google_url)
            logging.info(f"Searching Google for: {query}")

            # Scrape search result links (skip results without an anchor tag)
            content = page.content()
            soup = BeautifulSoup(content, 'html.parser')
            search_results = soup.find_all('div', class_='yuRUbf')
            links = [result.find('a')['href'] for result in search_results if result.find('a')]

            # Create a directory to store HTML files
            storage_dir = "scraped_html"
            os.makedirs(storage_dir, exist_ok=True)

            # Scrape content from each link
            for link in links[:max_links]:
                scraped_content, link_image_urls = scrape_single_link(page, link, storage_dir)
                if scraped_content:
                    scraped_contents.append(scraped_content)
                image_urls.extend(link_image_urls)
        finally:
            browser.close()

    # Save image URLs to a file
    with open('image_urls.json', 'w') as f:
        json.dump(image_urls, f)

    return scraped_contents


def scrape_single_link(page, link: str, storage_dir: str) -> Tuple[Optional[str], List[str]]:
    """Scrapes content and image URLs from a single link. Returns (None, []) on failure."""
    try:
        page.goto(link, timeout=10000)  # 10-second timeout
        content = page.content()

        # Generate a unique filename based on the link
        filename = hashlib.md5(link.encode()).hexdigest() + ".html"
        filepath = os.path.join(storage_dir, filename)

        # Save the HTML content
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)

        soup = BeautifulSoup(content, 'html.parser')

        # Extract text content
        text_content = soup.get_text(separator=' ', strip=True)

        # Extract image URLs
        image_urls = extract_image_urls(page, link)

        return text_content, image_urls
    except Exception as e:
        logging.error(f"Error scraping {link}: {str(e)}")
        return None, []


def extract_image_urls(page, base_url: str) -> List[str]:
    """Extracts image URLs from the current page."""
    image_urls = []

    # Collect <img> sources; keep every format rather than filtering by extension
    for img in page.query_selector_all('img'):
        src = img.get_attribute('src')
        if src:
            # Resolve relative URLs against the page URL
            if not src.startswith(('http://', 'https://')):
                src = urllib.parse.urljoin(base_url, src)
            image_urls.append(src)

    # Extract background images from inline styles
    elements_with_bg = page.query_selector_all('[style*="background-image"]')
    for element in elements_with_bg:
        style = element.get_attribute('style')
        url_match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
        if url_match:
            bg_url = url_match.group(1)
            if not bg_url.startswith(('http://', 'https://')):
                bg_url = urllib.parse.urljoin(base_url, bg_url)
            image_urls.append(bg_url)

    return list(set(image_urls))  # Remove duplicates


def get_image_urls() -> List[str]:
    """
    Retrieves image URLs from a JSON file.

    Returns:
        list: A list of image URLs.
    """
    logging.info("Getting image URLs")
    with open('image_urls.json', 'r') as f:
        return json.load(f)
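

# Example usage: a minimal sketch assuming Playwright's Chromium browser is installed
# ("playwright install chromium") and that Google returns a standard results page
# (a consent or CAPTCHA interstitial would leave the link list empty). The query
# string below is only illustrative.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    pages = scrape_google_and_links("playwright web scraping", max_links=3)
    print(f"Scraped {len(pages)} pages")
    print(f"Collected {len(get_image_urls())} image URLs")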