Merge pull request #2277 from kuwoyuki/reaperscans
fix: reaperscans
dipu-bd authored Feb 25, 2024
2 parents 6e47fd6 + 51fb83c commit abc97fa
Showing 2 changed files with 78 additions and 57 deletions.
12 changes: 7 additions & 5 deletions lncrawl/core/scraper.py
@@ -223,9 +223,11 @@ def submit_form(
         headers = CaseInsensitiveDict(headers)
         headers.setdefault(
             "Content-Type",
-            "multipart/form-data"
-            if multipart
-            else "application/x-www-form-urlencoded; charset=UTF-8",
+            (
+                "multipart/form-data"
+                if multipart
+                else "application/x-www-form-urlencoded; charset=UTF-8"
+            ),
         )
         return self.post_response(url, data=data, headers=headers, **kwargs)

@@ -269,15 +271,15 @@ def get_json(self, url, headers={}, **kwargs) -> Any:
         response = self.get_response(url, headers=headers, **kwargs)
         return response.json()
 
-    def post_json(self, url, data={}, headers={}) -> Any:
+    def post_json(self, url, data={}, headers={}, **kwargs) -> Any:
         """Make a POST request and return the content as JSON object"""
         headers = CaseInsensitiveDict(headers)
         headers.setdefault("Content-Type", "application/json")
         headers.setdefault(
             "Accept",
             "application/json,text/plain,*/*",
         )
-        response = self.post_response(url, data=data, headers=headers)
+        response = self.post_response(url, data=data, headers=headers, **kwargs)
         return response.json()
 
     def submit_form_json(
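
The **kwargs added to post_json forwards per-request options (for example a timeout) straight through to post_response; the reaperscans source below relies on this to pass timeout=10. A minimal sketch of the new signature in use, assuming a Crawler subclass; the URL, component name, and payload are made up for illustration:

import json

from lncrawl.core.crawler import Crawler


class ExampleCrawler(Crawler):
    base_url = "https://example.com/"  # hypothetical source

    def fetch_component_state(self):
        # made-up Livewire component state; a real caller builds this from the page
        payload = {"fingerprint": {"name": "example.component"}}
        # timeout is now forwarded through **kwargs to post_response
        return self.post_json(
            "https://example.com/livewire/message/example.component",
            data=json.dumps(payload),
            timeout=10,
        )
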
123 changes: 71 additions & 52 deletions sources/en/r/reaperscans.py
@@ -1,11 +1,11 @@
# -*- coding: utf-8 -*-
import json
import logging
from bs4 import Tag

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)
search_url = "https://reaperscans.com/?s=%s&post_type=wp-manga"


class Reaperscans(Crawler):
@@ -22,72 +22,91 @@ def initialize(self):
"https://discord.gg/MaRegMFhRb",
"https://discord.gg/reapercomics",
"h ttps://discord.gg/reapercomic",
"https://discord.gg/sb2jqkv",
"____",
"Join our Discord for updates on releases!",
"Join our Discord",
]
)
self.init_executor(ratelimit=0.9)

def search_novel(self, query):
query = query.lower().replace(" ", "+")
soup = self.get_soup(search_url % query)

results = []
for tab in soup.select(".c-tabs-item__content"):
a = tab.select_one(".post-title h3 a")
latest = tab.select_one(".latest-chap .chapter a").text
votes = tab.select_one(".rating .total_votes").text
results.append(
{
"title": a.text.strip(),
"url": self.absolute_url(a["href"]),
"info": "%s | Rating: %s" % (latest, votes),
}
)

return results
def get_chapters_from_page(self, page, body):
url = self.absolute_url("/livewire/message/" + body["fingerprint"]["name"])
body["updates"] = [
{
"type": "callMethod",
"payload": {
"id": "00000",
"method": "gotoPage",
"params": [page, "page"],
},
}
]

response = self.post_json(url=url, data=json.dumps(body), timeout=10)
return self.make_soup(response["effects"]["html"])
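
For context, the body posted to /livewire/message/<component> here is the component's wire:initial-data snapshot (with its effects key removed) plus an updates entry asking Livewire to call gotoPage. Roughly, with a hypothetical component name and omitting whatever other state the snapshot carries:

body = {
    "fingerprint": {"name": "example.chapter-list"},  # hypothetical component name
    # ...remaining snapshot fields copied from wire:initial-data...
    "updates": [
        {
            "type": "callMethod",
            "payload": {"id": "00000", "method": "gotoPage", "params": [2, "page"]},
        }
    ],
}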

def get_chapters_from_doc(self, dom):
return [
{
"title": a.select_one("p").text.strip(),
"url": self.absolute_url(a["href"]),
}
for a in dom.select("div[wire\\3A id] ul[role] li a")
]
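
In the selector above, \3A is the CSS hex escape for a colon, so div[wire\3A id] matches elements carrying a wire:id attribute (the same thing the wire\: escapes elsewhere in this file express). A small self-contained check against made-up markup:

from bs4 import BeautifulSoup

# Hypothetical fragment of the Livewire-rendered chapter list.
html = """
<div wire:id="abc123">
  <ul role="list">
    <li><a href="/chapter-2"><p>Chapter 2</p></a></li>
    <li><a href="/chapter-1"><p>Chapter 1</p></a></li>
  </ul>
</div>
"""
dom = BeautifulSoup(html, "html.parser")
# "\3A " and "\:" are two spellings of the same escaped colon in wire:id
links = dom.select("div[wire\\3A id] ul[role] li a")
print([a.select_one("p").text for a in links])  # ['Chapter 2', 'Chapter 1']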

def insert_chapters(self, chapters):
self.chapters = [
{
"id": i + 1,
"title": x["title"],
"url": x["url"],
}
for i, x in enumerate(reversed(chapters))
]
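
insert_chapters reverses the scraped list before numbering, presumably because the site lists chapters newest-first; a small worked example of the resulting order (titles and URLs are placeholders):

scraped = [
    {"title": "Chapter 3", "url": "https://example.com/c3"},
    {"title": "Chapter 2", "url": "https://example.com/c2"},
    {"title": "Chapter 1", "url": "https://example.com/c1"},
]
# After insert_chapters(scraped), self.chapters holds:
#   [{"id": 1, "title": "Chapter 1", ...},
#    {"id": 2, "title": "Chapter 2", ...},
#    {"id": 3, "title": "Chapter 3", ...}]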

def read_novel_info(self):
logger.debug("Visiting %s", self.novel_url)
soup = self.get_soup(self.novel_url)

possible_title = soup.select_one(".post-title h1")
assert isinstance(possible_title, Tag)
for span in possible_title.select("span"):
span.extract()
self.novel_title = possible_title.text.strip()
self.novel_title = soup.select_one("h1").text.strip()
logger.info("Novel title: %s", self.novel_title)

possible_image = soup.select_one(".summary_image a img")
possible_image = soup.select_one(".h-full .w-full img")
if isinstance(possible_image, Tag):
self.novel_cover = self.absolute_url(possible_image["data-src"])
self.novel_cover = self.absolute_url(possible_image["src"])
logger.info("Novel cover: %s", self.novel_cover)

self.novel_author = " ".join(
[a.text.strip() for a in soup.select('.author-content a[href*="author"]')]
# livewire container
container = soup.select_one("main div[wire\\:id][wire\\:initial-data]")
# first page ssr json
body = json.loads(container["wire:initial-data"])
body.pop("effects")
# initial chapters from soup
chapters = self.get_chapters_from_doc(container)
page_count = 1
last_page = container.select_one(
'span[wire\\:key^="paginator-page"]:nth-last-child(2)'
)
logger.info("%s", self.novel_author)

chapter_list_url = self.absolute_url("ajax/chapters", self.novel_url)
soup = self.post_soup(chapter_list_url, headers={"accept": "*/*"})
for a in reversed(
soup.select('.wp-manga-chapter:not(.premium-block) a[href*="/chapter"]')
): # This stops it from trying to download locked premium chapters.
for span in a.findAll("span"): # Removes time and date from chapter title.
span.extract()
chap_id = len(self.chapters) + 1
vol_id = 1 + len(self.chapters) // 100
if chap_id % 100 == 1:
self.volumes.append({"id": vol_id})
self.chapters.append(
{
"id": chap_id,
"volume": vol_id,
"title": a.text.strip(),
"url": self.absolute_url(a["href"]),
}
)

if isinstance(last_page, Tag):
page_count = int(last_page.text.strip())
else:
self.insert_chapters(chapters)
# if we don't have the pagination el
return

toc_futures = [
self.executor.submit(self.get_chapters_from_page, k, body)
for k in range(2, page_count + 1)
]
self.resolve_futures(toc_futures, desc="TOC", unit="page")
for f in toc_futures:
chapters.extend(self.get_chapters_from_doc(f.result()))

self.insert_chapters(chapters)
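
The page count above is read from the paginator's second-to-last span, which assumes the last child is the "next" control; a minimal sketch of that assumption against made-up markup:

from bs4 import BeautifulSoup

# Hypothetical paginator fragment: numbered spans keyed "paginator-page-*"
# followed by a trailing "next" element, so the last page number sits at
# :nth-last-child(2).
html = """
<div>
  <span wire:key="paginator-page-1">1</span>
  <span wire:key="paginator-page-2">2</span>
  <span wire:key="paginator-page-17">17</span>
  <span>Next</span>
</div>
"""
dom = BeautifulSoup(html, "html.parser")
last_page = dom.select_one('span[wire\\:key^="paginator-page"]:nth-last-child(2)')
print(int(last_page.text.strip()))  # 17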

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])
contents = soup.select_one("div.text-left")
# TODO: better retry/timeout settings
soup = self.get_soup(chapter["url"], retry=3, timeout=10)
contents = soup.select_one("article")
return self.cleaner.extract_contents(contents)
