From 015be61b2cd2a6784acec9fd1b4c5a2c144738a1 Mon Sep 17 00:00:00 2001
From: ACA
Date: Fri, 9 Feb 2024 21:54:59 +0100
Subject: [PATCH 1/2] UukanshuOnline: fix URL & rename file

---
 sources/zh/{uukanshu.py => uukanshu_sj.py} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename sources/zh/{uukanshu.py => uukanshu_sj.py} (97%)

diff --git a/sources/zh/uukanshu.py b/sources/zh/uukanshu_sj.py
similarity index 97%
rename from sources/zh/uukanshu.py
rename to sources/zh/uukanshu_sj.py
index e4caae364..d930daae3 100644
--- a/sources/zh/uukanshu.py
+++ b/sources/zh/uukanshu_sj.py
@@ -12,7 +12,7 @@
 
 
 class UukanshuOnline(Crawler):
-    base_url = ["https://sj.uukanshu.com/"]
+    base_url = ["https://sj.uukanshu.net/"]  # previously .com, redirects .com to .net though
 
     def search_novel(self, query):
         query = query.lower().replace(" ", "+")

From 5874aa948d7d2bd5f48722a5775972cc3c0b5b7a Mon Sep 17 00:00:00 2001
From: ACA
Date: Fri, 9 Feb 2024 23:42:47 +0100
Subject: [PATCH 2/2] UukanshuOnline: add support for www and tw subdomains
 (traditional & simplified cn)

uukanshu_sj: rename class & make format_text a staticmethod
---
 sources/zh/uukanshu.py    | 74 +++++++++++++++++++++++++++++++++++++++
 sources/zh/uukanshu_sj.py |  5 +--
 2 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 sources/zh/uukanshu.py

diff --git a/sources/zh/uukanshu.py b/sources/zh/uukanshu.py
new file mode 100644
index 000000000..9b00acc5d
--- /dev/null
+++ b/sources/zh/uukanshu.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from bs4 import Tag
+
+from lncrawl.core.crawler import Crawler
+from lncrawl.models import Chapter, Volume
+from sources.zh.uukanshu_sj import UukanshuOnlineSJ
+
+logger = logging.getLogger(__name__)
+
+novel_search_url = "%ssearch.aspx?k=%s"
+
+
+class UukanshuOnline(Crawler):
+    # www is simplified cn, tw is traditional cn but both use same site structure
+    base_url = ["https://www.uukanshu.net/", "https://tw.uukanshu.net/"]
+
+    encoding = "gbk"
+
+    def initialize(self):
+        # the default lxml parser cannot handle the huge gbk encoded sites (fails after 4.3k chapters)
+        self.init_parser("html.parser")
+
+    def read_novel_info(self) -> None:
+        # the encoding for tw is utf-8, for www. is gbk -> otherwise output is messed up with wrong symbols.
+        if "tw." in self.novel_url:
+            self.encoding = "utf-8"
+
+        soup = self.get_soup(self.novel_url, encoding=self.encoding)
+        info = soup.select_one("dl.jieshao")
+        assert info  # if this fails, HTML structure has fundamentally changed -> needs update
+        meta = info.select_one("dd.jieshao_content")
+
+        img = info.select_one("dt.jieshao-img img")
+        if img:
+            self.novel_cover = self.absolute_url(img["src"])
+
+        self.novel_title = meta.select_one("h1 > a").text
+        self.novel_author = meta.select_one("h2 > a").text
+        self.novel_synopsis = meta.select_one("h3 > p").text
+
+        chapters = soup.select_one("ul#chapterList")
+        for chapter in list(chapters.children)[::-1]:  # reverse order as it's newest to oldest
+            # convince typehint that we're looking at Tags & also make sure we skip random text within the ul if any
+            if not isinstance(chapter, Tag):
+                continue
+            # find chapters
+            if chapter.has_attr("class") and "volume" in chapter["class"]:
+                self.volumes.append(
+                    Volume(
+                        id=len(self.volumes) + 1,
+                        title=chapter.text.strip(),
+                    )
+                )
+                continue
+            anchor = chapter.select_one("a")
+            if not anchor:
+                logger.warning("Found <li> in chapter list, not volume, without link: %s", chapter)
+                continue
+            self.chapters.append(
+                Chapter(
+                    id=len(self.chapters) + 1,
+                    url=self.absolute_url(anchor["href"]),
+                    title=anchor.text,
+                    volume=len(self.volumes),
+                )
+            )
+
+    def download_chapter_body(self, chapter: Chapter) -> str:
+        soup = self.get_soup(chapter.url, encoding=self.encoding)
+        content = soup.select_one("div#contentbox")
+        # use same filters as already implemented on essentially same site
+        return UukanshuOnlineSJ.format_text(self.cleaner.extract_contents(content))

diff --git a/sources/zh/uukanshu_sj.py b/sources/zh/uukanshu_sj.py
index d930daae3..159d7326c 100644
--- a/sources/zh/uukanshu_sj.py
+++ b/sources/zh/uukanshu_sj.py
@@ -11,7 +11,7 @@
 chapter_list_url = "%s&page=%d"
 
 
-class UukanshuOnline(Crawler):
+class UukanshuOnlineSJ(Crawler):
     base_url = ["https://sj.uukanshu.net/"]  # previously .com, redirects .com to .net though
 
     def search_novel(self, query):
@@ -88,7 +88,8 @@ def download_chapter_body(self, chapter):
 
         return self.format_text(content)
 
-    def format_text(self, text):
+    @staticmethod
+    def format_text(text):
         text = re.sub(
             r"[UＵ][UＵ]\s*看书\s*[wｗ][wｗ][wｗ][\.．][uｕ][uｕ][kｋ][aａ][nｎ][sｓ][hｈ][uｕ][\.．][cｃ][oｏ][mｍ]",
             "",
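
The key refactor in the second commit is turning format_text into a staticmethod so the new UukanshuOnline crawler can reuse UukanshuOnlineSJ's watermark filter without constructing a second crawler instance. The following is a minimal, self-contained sketch of that reuse pattern, not part of the patch: the stub classes and the sample string are illustrative assumptions (the real classes derive from lncrawl's Crawler), and the regex mirrors the one in the diff above.

# Illustrative sketch only: why format_text became a @staticmethod --
# it can be called on the class itself, no instance required.
import re


class UukanshuOnlineSJ:  # stub standing in for the real crawler class
    @staticmethod
    def format_text(text):
        # same watermark filter as in the patch: strips "UU看书 www.uukanshu.com",
        # tolerating full-width variants of each latin letter and the dot
        return re.sub(
            r"[UＵ][UＵ]\s*看书\s*[wｗ][wｗ][wｗ][\.．][uｕ][uｕ][kｋ][aａ][nｎ]"
            r"[sｓ][hｈ][uｕ][\.．][cｃ][oｏ][mｍ]",
            "",
            text,
        )


class UukanshuOnline:  # stub; the real class also parses the chapter page first
    def download_chapter_body(self, chapter_text):
        # no UukanshuOnlineSJ() instance is needed -- the staticmethod is
        # reused directly on the sibling class, exactly as the new file does
        return UukanshuOnlineSJ.format_text(chapter_text)


if __name__ == "__main__":
    sample = "第一章 开始 UU看书 www.uukanshu.com 最新章节"
    print(UukanshuOnline().download_chapter_body(sample))
    # -> "第一章 开始  最新章节"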