-
-
Notifications
You must be signed in to change notification settings - Fork 291
/
_02_searchable_soup.py
115 lines (94 loc) · 4.42 KB
/
_02_searchable_soup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
"""
# TODO: Read the TODOs carefully and remove all existing comments in this file.
This is a sample using the SearchableSoupTemplate as the template. This template
provides a wrapper around the GeneralSoupTemplate to support search.
Put your source file inside the matching language folder. Because the `en` folder
holds too many files, its files are grouped by the first letter of the domain name.
"""
import logging
from typing import Generator, Union
from bs4 import BeautifulSoup, Tag
from lncrawl.models import Chapter, SearchResult, Volume
from lncrawl.templates.soup.searchable import SearchableSoupTemplate
# Module-level logger, named after this module via `__name__`.
logger = logging.getLogger(__name__)
# TODO: You can safely delete all [OPTIONAL] methods if you do not need them.
class MyCrawlerName(SearchableSoupTemplate):
    """Sample crawler skeleton built on `SearchableSoupTemplate`.

    Every [REQUIRED] method raises `NotImplementedError` until it is filled
    in, so a forgotten stub fails with a clear message instead of silently
    returning `None`. [OPTIONAL] methods may be deleted when not needed.
    """

    # TODO: [REQUIRED] Provide the URLs supported by this crawler.
    base_url = ["http://sample.url/"]

    # TODO: [OPTIONAL] Set True if this crawler is for manga/manhua/manhwa.
    has_manga = False

    # TODO: [OPTIONAL] Set True if this source contains machine translations.
    has_mtl = False

    # TODO: [OPTIONAL] This is called before all other methods.
    def initialize(self) -> None:
        """Set up the crawler before any other method is called."""
        # You can customize `TextCleaner` and other necessary things.
        pass

    # TODO: [OPTIONAL] This is called once per session before searching and fetching novel info.
    def login(self, username_or_email: str, password_or_token: str) -> None:
        """Authenticate against the source site.

        Args:
            username_or_email: The account identifier supplied by the user.
            password_or_token: The matching password or access token.
        """
        # Examples:
        # - https://github.com/dipu-bd/lightnovel-crawler/blob/master/sources/multi/mtlnovel.py
        # - https://github.com/dipu-bd/lightnovel-crawler/blob/master/sources/multi/ranobes.py
        pass

    # TODO: [OPTIONAL] If it is necessary to logout after session is finished, you can implement this.
    def logout(self):
        """Log out once the session is finished, if the site requires it."""
        pass

    # TODO: [REQUIRED] Select novel items found in search page from the query
    def select_search_items(self, query: str) -> Generator[Tag, None, None]:
        """Yield the tags of the novels found on the search page.

        Args:
            query: The search text entered by the user.

        Raises:
            NotImplementedError: Until this stub is implemented.
        """
        # Example (needs `from urllib.parse import urlencode`):
        #   params = {"searchkey": query}
        #   soup = self.post_soup(f"{self.home_url}search?{urlencode(params)}")
        #   yield from soup.select(".col-content .con .txt h3 a")
        raise NotImplementedError("select_search_items is not implemented")

    # TODO: [REQUIRED] Parse a tag and return single search result
    def parse_search_item(self, tag: Tag) -> SearchResult:
        """Convert one tag from `select_search_items` into a `SearchResult`.

        Args:
            tag: A tag produced by `self.select_search_items`.

        Raises:
            NotImplementedError: Until this stub is implemented.
        """
        # Example:
        #   return SearchResult(
        #       title=tag.text.strip(),
        #       url=self.absolute_url(tag["href"]),
        #   )
        raise NotImplementedError("parse_search_item is not implemented")

    # TODO: [OPTIONAL] Get a BeautifulSoup instance from the self.novel_url
    def get_novel_soup(self) -> BeautifulSoup:
        """Return the soup obtained from `self.get_soup(self.novel_url)`."""
        return self.get_soup(self.novel_url)

    # TODO: [REQUIRED] Parse and return the novel title
    def parse_title(self, soup: BeautifulSoup) -> str:
        """Return the novel title.

        Args:
            soup: The result of `self.get_soup(self.novel_url)`.

        Raises:
            NotImplementedError: Until this stub is implemented.
        """
        raise NotImplementedError("parse_title is not implemented")

    # TODO: [REQUIRED] Parse and return the novel cover
    def parse_cover(self, soup: BeautifulSoup) -> str:
        """Return the novel cover.

        Args:
            soup: The result of `self.get_soup(self.novel_url)`.

        Raises:
            NotImplementedError: Until this stub is implemented.
        """
        raise NotImplementedError("parse_cover is not implemented")

    # TODO: [REQUIRED] Parse and return the novel authors
    def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
        """Yield the novel's author names.

        Args:
            soup: The result of `self.get_soup(self.novel_url)`.

        Raises:
            NotImplementedError: Until this stub is implemented.
        """
        # Example 1: <a single author example>
        #   tag = soup.find("strong", string="Author:")
        #   assert tag
        #   yield tag.next_sibling.text.strip()
        #
        # Example 2: <multiple authors example>
        #   for a in soup.select(".m-imgtxt a[href*='/authors/']"):
        #       yield a.text.strip()
        raise NotImplementedError("parse_authors is not implemented")

    # TODO: [REQUIRED] Parse and set the volumes and chapters
    def parse_chapter_list(
        self, soup: BeautifulSoup
    ) -> Generator[Union[Chapter, Volume], None, None]:
        """Yield the novel's volumes and chapters.

        Args:
            soup: The result of `self.get_soup(self.novel_url)`.

        Raises:
            NotImplementedError: Until this stub is implemented.
        """
        raise NotImplementedError("parse_chapter_list is not implemented")

    # TODO: [REQUIRED] Select the tag containing the chapter text
    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
        """Return the tag that contains the chapter text.

        Args:
            soup: The result of `self.get_soup(chapter.url)`.

        Raises:
            NotImplementedError: Until this stub is implemented.
        """
        # Example: return soup.select_one(".m-read .txt")
        raise NotImplementedError("select_chapter_body is not implemented")

    # TODO: [OPTIONAL] Return the index in self.chapters which contains a chapter URL
    def index_of_chapter(self, url: str) -> int:
        """Return the index in `self.chapters` of the chapter with this URL.

        Args:
            url: The chapter URL to look up.
        """
        # To get more help, check the default implementation in the `Crawler` class.
        # Delegate to that inherited default so that leaving this stub in place
        # does not replace it with a method returning `None`.
        # NOTE(review): the base class is not visible here; confirm it defines
        # `index_of_chapter`, or delete this override.
        return super().index_of_chapter(url)