-
-
Notifications
You must be signed in to change notification settings - Fork 291
/
freewebnovel.py
106 lines (92 loc) · 4.14 KB
/
freewebnovel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
import unicodedata
from bs4 import BeautifulSoup, Tag
from lncrawl.models import Chapter, SearchResult
from lncrawl.templates.soup.chapter_only import ChapterOnlySoupTemplate
from lncrawl.templates.soup.searchable import SearchableSoupTemplate
class FreeWebNovelCrawler(SearchableSoupTemplate, ChapterOnlySoupTemplate):
    """Crawler for freewebnovel.com and the mirror domains in ``base_url``.

    Combines the searchable and chapter-only soup templates: the methods
    below supply the CSS selectors and small parsing steps for search
    results, novel metadata, the chapter list and the chapter body.
    """

    base_url = [
        "https://freewebnovel.com/",
        "https://bednovel.com/",
        "https://innread.com/",
        "https://innnovel.com/",
        "https://libread.com/",
    ]

    def initialize(self) -> None:
        """Configure the request executor and the chapter-text cleaner."""
        # NOTE(review): `ratelimit=2` is passed straight to the executor; its
        # unit (presumably requests per second) is defined by the base class.
        self.init_executor(ratelimit=2)
        self.cleaner.bad_tags.update(["h4", "sub"])
        # Regexes keyed by tag name. Presumably the cleaner drops a <p>/<i>
        # element whose text matches any pattern - confirm against the
        # cleaner implementation. The patterns target site watermarks.
        self.cleaner.bad_tag_text_pairs.update(
            {
                "p": [
                    r"freewebnovel\.com",
                    r"innread\.com",
                    r"bednovel\.com",
                    r"Updates by Freewebnovel\. com",
                    r"” Search Freewebnovel\.com\. on google”\.",
                    r"\/ Please Keep reading on MYFreeWebNovel\.C0M",
                    r"please keep reading on Freewebnovel\(dot\)C0M",
                    r"Continue\_reading on Freewebnovel\.com",
                    r"Continue \-reading on Freewebnovel\.com",
                    r"\/ Please Keep reading 0n FreewebNOVEL\.C0M",
                    r"\[ Follow current novels on Freewebnovel\.com \]",
                    r"‘Freewebnovel\.com\*’",
                    r"‘Search Freewebnovel\.com\, on google’",
                    r"‘ Search Freewebnovel\.com\(\) ‘",
                    r"“Freewebnovel\.com \.”",
                    r"“Please reading on Freewebnovel\.com\.”",
                    r"“Search Freewebnovel\.com\. on google”",
                    r"“Read more on Freewebnovel\.com\. org”",
                    r"Thank you for reading on FreeWebNovel\.me",
                    r"Please reading \-on Freewebnovel\.com",
                    r"”Search \(Freewebnovel\.com\(\) on google\”\?",
                    r"“Please reading on Freewebnovel\.com \:”",
                    r"”Please reading on Freewebnovel\.com\.”\?",
                    r"“Please reading on Freewebnovel\.com\>\; ”",
                ],
                "i": [
                    r"\[ Follow current novels on Freewebnovel\.com \]",
                ],
            }
        )

    def select_search_items(self, query: str):
        """Yield the anchor tag of every search hit for ``query``.

        The site search is a POST form whose only field is ``searchkey``.
        """
        soup = self.post_soup(
            f"{self.home_url}search/",
            data={"searchkey": query},
        )
        yield from soup.select(".col-content .con .txt h3 a")

    def parse_search_item(self, tag: Tag) -> SearchResult:
        """Build a ``SearchResult`` from one search-hit anchor tag."""
        return SearchResult(
            title=tag.text.strip(),
            url=self.absolute_url(tag["href"]),
        )

    def parse_title(self, soup: BeautifulSoup) -> str:
        """Return the novel title from the info page.

        Raises:
            ValueError: if the title heading is not present in ``soup``.
        """
        tag = soup.select_one(".m-desc h1.tit")
        # Explicit check instead of `assert`, which is stripped under -O.
        if not isinstance(tag, Tag):
            raise ValueError("Novel title not found: .m-desc h1.tit")
        return tag.text.strip()

    def parse_cover(self, soup: BeautifulSoup) -> str:
        """Return the absolute cover image URL, or ``""`` if none is set.

        ``data-src`` is preferred over ``src``; an attribute that is missing
        or empty is skipped so an empty placeholder falls through to the
        next candidate.

        Raises:
            ValueError: if the cover ``<img>`` is not present in ``soup``.
        """
        tag = soup.select_one(".m-imgtxt img")
        # Explicit check instead of `assert`, which is stripped under -O.
        if not isinstance(tag, Tag):
            raise ValueError("Cover image not found: .m-imgtxt img")
        for attr in ("data-src", "src"):
            value = tag.get(attr)
            if value:
                return self.absolute_url(value)
        # Honour the declared `str` return type rather than falling off the
        # end of the function with an implicit None.
        return ""

    def parse_authors(self, soup: BeautifulSoup):
        """Yield the stripped text of every author link on the info page."""
        for a in soup.select(".m-imgtxt a[href*='/authors/']"):
            yield a.text.strip()

    def select_chapter_tags(self, soup: BeautifulSoup):
        """Yield chapter anchor tags found under each ``#idData`` element."""
        for container in soup.select("#idData"):
            yield from container.select("li > a")

    def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
        """Build a ``Chapter`` from one chapter anchor tag and its id."""
        return Chapter(
            id=id,
            url=self.absolute_url(tag["href"]),
            title=tag.text.strip(),
        )

    def normalize_text(self, text: str) -> str:
        """Return ``text`` in Unicode NFKC normal form."""
        return unicodedata.normalize("NFKC", text)

    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
        """Return the chapter content re-parsed after NFKC normalization.

        The ``.m-read .txt`` element is serialized, normalized with
        ``normalize_text`` and parsed again, so the result is a new
        ``BeautifulSoup`` document wrapping that element. Returns ``None``
        (despite the annotation) when the element is missing, exactly as
        before.
        """
        body_tag = soup.select_one(".m-read .txt")
        if body_tag is None:
            return None
        # NOTE(review): normalizing the serialized HTML also normalizes
        # markup and attribute values, not only the visible text.
        normalized_html = self.normalize_text(str(body_tag))
        return BeautifulSoup(normalized_html, "html.parser")