-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
68 lines (51 loc) · 2.57 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
import os
import time
def save_chapter(title, paragraphs):
# Create a valid filename from the title
filename = f"{title}.txt".replace(" ", "_").replace("/", "_").replace("\\", "_")
# Write the paragraphs to the file
with open(filename, 'w', encoding='utf-8') as file:
file.write(f"Title: {title}\n\n")
for paragraph in paragraphs:
file.write(paragraph + "\n\n")
def scrape_paragraphs(url):
# Set up the Selenium web driver (Chrome in this case)
driver = webdriver.Chrome()
# Navigate to the URL
driver.get(url)
try:
while True:
# Find the element with the class 'chapter-text' and get its text as the title
title_element = driver.find_element(By.CLASS_NAME, 'chapter-text')
title = title_element.text
# Find the div with the id 'chapter-content'
chapter_content_div = driver.find_element(By.ID, 'chapter-content')
# Find all paragraph tags within this div
paragraphs = chapter_content_div.find_elements(By.TAG_NAME, 'p')
# Extract the text from each paragraph
paragraph_texts = [p.text for p in paragraphs]
# Save the chapter
save_chapter(title, paragraph_texts)
# Find the button with the id 'next_chap'
try:
next_button = driver.find_element(By.ID, 'next_chap')
# Check if the next button is disabled
if 'disabled' in next_button.get_attribute('class'):
print("Next chapter button is disabled. Stopping the loop.")
break
# Click the next button
next_button.click()
except (NoSuchElementException, ElementNotInteractableException):
print("Next chapter button is not found or not interactable. Stopping the loop.")
break
except Exception as e:
print(f"An error occurred: {e}")
finally:
# Close the web driver
driver.quit()
# Replace 'your_website_url' with the actual URL of the website you want to scrape
scrape_paragraphs('https://novelfull.net/tales-of-the-reincarnated-lord/chapter-350-bank-entitlement-and-real-estate.html')