-
-
Notifications
You must be signed in to change notification settings - Fork 291
/
_04_searchable_chapter_only_soup.py
128 lines (106 loc) · 4.98 KB
/
_04_searchable_chapter_only_soup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# -*- coding: utf-8 -*-
"""
# TODO: Read the TODOs carefully and remove all existing comments in this file.
This is a sample using the SearchableSoupTemplate and ChapterOnlySoupTemplate as the template.
It should be able to do searching and generating only chapter list excluding volumes list.
Put your source file inside the language folder. The `en` folder has too many
files, therefore it is grouped using the first letter of the domain name.
"""
import logging
from typing import Generator
from bs4 import BeautifulSoup, Tag
from lncrawl.models import Chapter, SearchResult
from lncrawl.templates.soup.chapter_only import ChapterOnlySoupTemplate
from lncrawl.templates.soup.searchable import SearchableSoupTemplate
# Module-level logger, keyed to this module's import path.
logger = logging.getLogger(__name__)
# TODO: You can safely delete all [OPTIONAL] methods if you do not need them.
class MyCrawlerName(SearchableSoupTemplate, ChapterOnlySoupTemplate):
    """Sample crawler combining the searchable and chapter-only soup templates.

    A source built from this template can run search queries and produce a
    flat chapter list (no volume grouping). Replace the TODO stubs below
    with real parsing logic for the target site.
    """

    # TODO: [REQUIRED] Every URL prefix this crawler should handle.
    base_url = ["http://sample.url/"]

    # TODO: [OPTIONAL] Flip to True when the source hosts manga/manhua/manhwa.
    has_manga = False

    # TODO: [OPTIONAL] Flip to True when the source serves machine translations.
    has_mtl = False

    def initialize(self) -> None:
        # TODO: [OPTIONAL] Runs before every other method; customize things
        # like `TextCleaner` here.
        pass

    def login(self, username_or_email: str, password_or_token: str) -> None:
        # TODO: [OPTIONAL] Runs once per session, before searching and before
        # fetching novel info. Reference implementations:
        # - https://github.com/dipu-bd/lightnovel-crawler/blob/master/sources/multi/mtlnovel.py
        # - https://github.com/dipu-bd/lightnovel-crawler/blob/master/sources/multi/ranobes.py
        pass

    def logout(self):
        # TODO: [OPTIONAL] Implement only if the session must be closed out
        # explicitly when finished.
        pass

    def select_search_items(self, query: str) -> Generator[Tag, None, None]:
        # TODO: [REQUIRED] Yield the novel-item tags found on the search page
        # for `query` (the raw user input).
        #
        # Example:
        #   params = {"searchkey": query}
        #   soup = self.post_soup(f"{self.home_url}search?{urlencode(params)}")
        #   yield from soup.select(".col-content .con .txt h3 a")
        pass

    def parse_search_item(self, tag: Tag) -> SearchResult:
        # TODO: [REQUIRED] Convert one tag yielded by `select_search_items`
        # into a SearchResult.
        #
        # Example:
        #   return SearchResult(
        #       title=tag.text.strip(),
        #       url=self.absolute_url(tag["href"]),
        #   )
        pass

    def get_novel_soup(self) -> BeautifulSoup:
        # TODO: [OPTIONAL] Override to customize how the novel page is fetched.
        return self.get_soup(self.novel_url)

    def parse_title(self, soup: BeautifulSoup) -> str:
        # TODO: [REQUIRED] Extract the novel title from the novel-page soup
        # (the result of `self.get_soup(self.novel_url)`).
        pass

    def parse_cover(self, soup: BeautifulSoup) -> str:
        # TODO: [REQUIRED] Extract the cover image URL from the novel-page soup.
        pass

    def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
        # TODO: [REQUIRED] Yield author names from the novel-page soup.
        #
        # Example 1 (single author):
        #   tag = soup.find("strong", string="Author:")
        #   assert tag
        #   yield tag.next_sibling.text.strip()
        #
        # Example 2 (multiple authors):
        #   for a in soup.select(".m-imgtxt a[href*='/authors/']"):
        #       yield a.text.strip()
        pass

    def select_chapter_tags(self, soup: BeautifulSoup) -> Generator[Tag, None, None]:
        # TODO: [REQUIRED] Yield the chapter list-item tags from the novel-page
        # soup.
        #
        # Example:
        #   yield from soup.select(".m-newest2 li > a")
        pass

    def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
        # TODO: [REQUIRED] Build a Chapter from one tag yielded by
        # `select_chapter_tags`.
        #
        # Example:
        #   return Chapter(
        #       id=id,
        #       title=tag.text.strip(),
        #       url=self.absolute_url(tag["href"]),
        #   )
        pass

    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
        # TODO: [REQUIRED] Return the tag holding the chapter text from the
        # chapter-page soup (the result of `self.get_soup(chapter.url)`).
        #
        # Example:
        #   return soup.select_one(".m-read .txt")
        pass

    def index_of_chapter(self, url: str) -> int:
        # TODO: [OPTIONAL] Return the index in `self.chapters` whose entry
        # matches `url`. See the default implementation in the `Crawler`
        # class for guidance.
        pass