-
-
Notifications
You must be signed in to change notification settings - Fork 291
/
_14_with_volume_browser.py
167 lines (138 loc) · 6.37 KB
/
_14_with_volume_browser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# -*- coding: utf-8 -*-
"""
# TODO: Read the TODOs carefully and remove all existing comments in this file.
This is a sample using the ChapterWithVolumeBrowserTemplate as the template.
It provides a wrapper around the GeneralBrowserTemplate that generates both
the volume list and the chapter list.
Put your source file inside the language folder. The `en` folder has too many
files, so its sources are grouped by the first letter of the domain name.
"""
import logging
from typing import Generator
from bs4 import BeautifulSoup, Tag
from lncrawl.models import Chapter, Volume
from lncrawl.templates.browser.with_volume import ChapterWithVolumeBrowserTemplate
logger = logging.getLogger(__name__)
# TODO: You can safely delete all [OPTIONAL] methods if you do not need them.
class MyCrawlerName(ChapterWithVolumeBrowserTemplate):
    """Crawler skeleton for a source that lists chapters grouped by volume.

    Fill in each [REQUIRED] method for the target site; [OPTIONAL]
    methods may be deleted if the template defaults are sufficient.
    The ``*_in_browser`` variants are used when the source needs a real
    browser (e.g. Cloudflare-protected pages) instead of plain requests.
    """
    # TODO: [REQUIRED] Provide the URLs supported by this crawler.
    base_url = ["http://sample.url/"]
    # TODO: [OPTIONAL] Set True if this crawler is for manga/manhua/manhwa.
    has_manga = False
    # TODO: [OPTIONAL] Set True if this source contains machine translations.
    has_mtl = False
    # TODO: [OPTIONAL] This is called before all other methods.
    def initialize(self) -> None:
        # You can customize `TextCleaner` and other necessary things here.
        pass
    # TODO: [OPTIONAL] Open the Novel URL in the browser
    def visit_novel_page_in_browser(self) -> BeautifulSoup:
        # Example: self.visit(self.novel_url)
        pass
    # TODO: [OPTIONAL] Parse and return the novel title in the browser
    def parse_title_in_browser(self) -> str:
        # Example: return self.parse_title(self.browser.soup)
        pass
    # TODO: [REQUIRED] Parse and return the novel title
    def parse_title(self, soup: BeautifulSoup) -> str:
        # The soup here is the result of `self.get_soup(self.novel_url)`
        pass
    # TODO: [OPTIONAL] Parse and return the novel cover image in the browser
    def parse_cover_in_browser(self) -> str:
        # Example: return self.parse_cover(self.browser.soup)
        pass
    # TODO: [REQUIRED] Parse and return the novel cover
    def parse_cover(self, soup: BeautifulSoup) -> str:
        # The soup here is the result of `self.get_soup(self.novel_url)`
        pass
    # TODO: [OPTIONAL] Parse and return the novel author in the browser
    # NOTE(review): annotation corrected from Generator[Tag, ...] — this
    # delegates to `parse_authors`, which yields author name strings.
    def parse_authors_in_browser(self) -> Generator[str, None, None]:
        # Example: yield from self.parse_authors(self.browser.soup)
        pass
    # TODO: [REQUIRED] Parse and return the novel authors
    def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
        # The soup here is the result of `self.get_soup(self.novel_url)`
        #
        # Example 1: <single author>
        # tag = soup.find("strong", string="Author:")
        # assert tag
        # yield tag.next_sibling.text.strip()
        #
        # Example 2: <multiple authors>
        # for a in soup.select(".m-imgtxt a[href*='/authors/']"):
        #     yield a.text.strip()
        pass
    # TODO: [OPTIONAL] Open the Chapter URL in the browser
    def visit_chapter_page_in_browser(self, chapter: Chapter) -> None:
        # Example: self.visit(chapter.url)
        pass
    # TODO: [REQUIRED] Select volume list item tags from the page soup
    def select_volume_tags(self, soup: BeautifulSoup) -> Generator[Tag, None, None]:
        # The soup here is the result of `self.get_soup(self.novel_url)`
        #
        # Example: yield from soup.select("#toc .vol-item")
        pass
    # TODO: [OPTIONAL] Select volume list item tags from the browser
    def select_volume_tags_in_browser(self) -> Generator[Tag, None, None]:
        # Example: return self.select_volume_tags(self.browser.soup)
        pass
    # TODO: [REQUIRED] Parse a single volume from volume list item tag
    def parse_volume_item(self, tag: Tag, id: int) -> Volume:
        # The tag here comes from `self.select_volume_tags`
        # The id here is the next available volume id
        #
        # Example:
        # return Volume(
        #     id=id,
        #     title=tag.text.strip(),
        # )
        pass
    # TODO: [OPTIONAL] Parse a single volume from volume list item tag when using browser
    def parse_volume_item_in_browser(self, tag: Tag, id: int) -> Volume:
        # Example: return self.parse_volume_item(tag, id)
        pass
    # TODO: [REQUIRED] Select chapter list item tags from volume tag and page soup
    def select_chapter_tags(self, tag: Tag, vol: Volume) -> Generator[Tag, None, None]:
        # The tag here comes from `self.select_volume_tags`
        # The vol here comes from `self.parse_volume_item`
        #
        # Example: yield from tag.select(".chapter-item")
        pass
    # TODO: [OPTIONAL] Select chapter list item tags from volume tag and page soup when in browser
    def select_chapter_tags_in_browser(
        self, tag: Tag, vol: Volume
    ) -> Generator[Tag, None, None]:
        # Example: return self.select_chapter_tags(tag, vol)
        pass
    # TODO: [REQUIRED] Parse a single chapter from chapter list item tag
    def parse_chapter_item(self, tag: Tag, id: int, vol: Volume) -> Chapter:
        # The tag here comes from `self.select_chapter_tags`
        # The vol here comes from `self.parse_volume_item`
        # The id here is the next available chapter id
        #
        # Example:
        # return Chapter(
        #     id=id,
        #     volume=vol.id,
        #     title=tag.text.strip(),
        #     url=self.absolute_url(tag["href"]),
        # )
        pass
    # TODO: [OPTIONAL] Parse a single chapter from chapter list item tag when in browser
    def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter:
        # Example: return self.parse_chapter_item(tag, id, vol)
        pass
    # TODO: [OPTIONAL] Select the tag containing the chapter text in the browser
    def select_chapter_body_in_browser(self) -> Tag:
        # Example: return self.select_chapter_body(self.browser.soup)
        pass
    # TODO: [REQUIRED] Select the tag containing the chapter text
    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
        # The soup here is the result of `self.get_soup(chapter.url)`
        #
        # Example: return soup.select_one(".m-read .txt")
        pass
    # TODO: [OPTIONAL] Return the index in self.chapters which contains a chapter URL
    def index_of_chapter(self, url: str) -> int:
        # To get more help, check the default implementation in the `Crawler` class.
        pass