-
-
Notifications
You must be signed in to change notification settings - Fork 291
/
relibrary.py
58 lines (46 loc) · 1.7 KB
/
relibrary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import logging
from typing import Generator
from bs4 import BeautifulSoup, Tag
from lncrawl.models import Chapter
from lncrawl.templates.soup.chapter_only import ChapterOnlySoupTemplate
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class relibCrawler(ChapterOnlySoupTemplate):
    """Crawler for https://re-library.com/ built on ChapterOnlySoupTemplate.

    Each hook below extracts one piece of information from a parsed page
    using CSS selectors.
    """

    base_url = ["https://re-library.com/"]

    def initialize(self) -> None:
        """Configure the executor and register extra cleaner rules."""
        # NOTE(review): presumably limits the crawler to a single worker;
        # confirm against init_executor in the base class.
        self.init_executor(1)

        # Extra CSS selectors handed to the cleaner (presumably stripped
        # from chapter content; verify against the cleaner implementation).
        unwanted_selectors = [
            "tr",
            ".nextPageLink",
            ".prevPageLink",
            ".su-button",
            "a[href*='re-library.com']",
        ]
        self.cleaner.bad_css.update(unwanted_selectors)

        # Tag-name / text pair registered with the cleaner.
        self.cleaner.bad_tag_text_pairs.update({"h2": "References"})

    def parse_title(self, soup: BeautifulSoup) -> str:
        """Return the stripped text of the first ``.entry-title`` element."""
        return soup.select_one(".entry-title").text.strip()

    def parse_cover(self, soup: BeautifulSoup) -> str:
        """Return the absolute URL of the first image inside the entry table.

        The ``data-src`` attribute is preferred; ``src`` is used when
        ``data-src`` is missing or empty.
        """
        image = soup.select_one(".entry-content table img")
        source = image.get("data-src")
        if not source:
            source = image.get("src")
        return self.absolute_url(source)

    def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
        """Yield the stripped text of every ``/nauthor/`` link in the entry."""
        content = soup.select_one(".entry-content")
        author_links = content.select("a[href*='/nauthor/']")
        yield from (link.text.strip() for link in author_links)

    def select_chapter_tags(self, soup: BeautifulSoup) -> Generator[Tag, None, None]:
        """Yield each anchor that is a direct child of a ``.page_item``."""
        for anchor in soup.select(".page_item > a"):
            yield anchor

    def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
        """Build a Chapter from an anchor tag and the given chapter id.

        The title is the anchor's stripped text and the URL is its ``href``
        made absolute.
        """
        chapter_title = tag.text.strip()
        chapter_url = self.absolute_url(tag["href"])
        return Chapter(id=id, title=chapter_title, url=chapter_url)

    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
        """Return the first ``.entry-content`` element of the chapter page."""
        body = soup.select_one(".entry-content")
        return body