From 015be61b2cd2a6784acec9fd1b4c5a2c144738a1 Mon Sep 17 00:00:00 2001
From: ACA
Date: Fri, 9 Feb 2024 21:54:59 +0100
Subject: [PATCH 1/2] UukanshuOnline: fix URL & rename file

---
 sources/zh/{uukanshu.py => uukanshu_sj.py} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename sources/zh/{uukanshu.py => uukanshu_sj.py} (97%)

diff --git a/sources/zh/uukanshu.py b/sources/zh/uukanshu_sj.py
similarity index 97%
rename from sources/zh/uukanshu.py
rename to sources/zh/uukanshu_sj.py
index e4caae364..d930daae3 100644
--- a/sources/zh/uukanshu.py
+++ b/sources/zh/uukanshu_sj.py
@@ -12,7 +12,7 @@
 
 
 class UukanshuOnline(Crawler):
-    base_url = ["https://sj.uukanshu.com/"]
+    base_url = ["https://sj.uukanshu.net/"]  # previously .com, redirects .com to .net though
 
     def search_novel(self, query):
         query = query.lower().replace(" ", "+")

From 5874aa948d7d2bd5f48722a5775972cc3c0b5b7a Mon Sep 17 00:00:00 2001
From: ACA
Date: Fri, 9 Feb 2024 23:42:47 +0100
Subject: [PATCH 2/2] UukanshuOnline: add support for www and tw subdomains
 (traditional & simplified cn)

uukanshu_sj: rename class & make format_text a staticmethod
---
 sources/zh/uukanshu.py    | 74 +++++++++++++++++++++++++++++++++++++++
 sources/zh/uukanshu_sj.py |  5 +--
 2 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 sources/zh/uukanshu.py

diff --git a/sources/zh/uukanshu.py b/sources/zh/uukanshu.py
new file mode 100644
index 000000000..9b00acc5d
--- /dev/null
+++ b/sources/zh/uukanshu.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from bs4 import Tag
+
+from lncrawl.core.crawler import Crawler
+from lncrawl.models import Chapter, Volume
+from sources.zh.uukanshu_sj import UukanshuOnlineSJ
+
+logger = logging.getLogger(__name__)
+
+novel_search_url = "%ssearch.aspx?k=%s"
+
+
+class UukanshuOnline(Crawler):
+    # www is simplified cn, tw is traditional cn but both use same site structure
+    base_url = ["https://www.uukanshu.net/", "https://tw.uukanshu.net/"]
+
+    encoding = "gbk"
+
+    def initialize(self):
+        # the default lxml parser cannot handle the huge gbk encoded sites (fails after 4.3k chapters)
+        self.init_parser("html.parser")
+
+    def read_novel_info(self) -> None:
+        # the encoding for tw is utf-8, for www. is gbk -> otherwise output is messed up with wrong symbols.
+        if "tw." in self.novel_url:
+            self.encoding = "utf-8"
+
+        soup = self.get_soup(self.novel_url, encoding=self.encoding)
+        info = soup.select_one("dl.jieshao")
+        assert info  # if this fails, HTML structure has fundamentally changed -> needs update
+        meta = info.select_one("dd.jieshao_content")
+
+        img = info.select_one("dt.jieshao-img img")
+        if img:
+            self.novel_cover = self.absolute_url(img["src"])
+
+        self.novel_title = meta.select_one("h1 > a").text
+        self.novel_author = meta.select_one("h2 > a").text
+        self.novel_synopsis = meta.select_one("h3 > p").text
+
+        chapters = soup.select_one("ul#chapterList")
+        for chapter in list(chapters.children)[::-1]:  # reverse order as it's newest to oldest
+            # convince typehint that we're looking at Tags & also make sure we skip random text within the ul if any
+            if not isinstance(chapter, Tag):
+                continue
+            # find chapters
+            if chapter.has_attr("class") and "volume" in chapter["class"]:
+                self.volumes.append(
+                    Volume(
+                        id=len(self.volumes) + 1,
+                        title=chapter.text.strip(),
+                    )
+                )
+                continue
+            anchor = chapter.select_one("a")
+            if not anchor:
+                logger.warning("Found <li> in chapter list, not volume, without link: %s", chapter)
+                continue
+            self.chapters.append(
+                Chapter(
+                    id=len(self.chapters) + 1,
+                    url=self.absolute_url(anchor["href"]),
+                    title=anchor.text,
+                    volume=len(self.volumes),
+                )
+            )
+
+    def download_chapter_body(self, chapter: Chapter) -> str:
+        soup = self.get_soup(chapter.url, encoding=self.encoding)
+        content = soup.select_one("div#contentbox")
+        # use same filters as already implemented on essentially same site
+        return UukanshuOnlineSJ.format_text(self.cleaner.extract_contents(content))

diff --git a/sources/zh/uukanshu_sj.py b/sources/zh/uukanshu_sj.py
index d930daae3..159d7326c 100644
--- a/sources/zh/uukanshu_sj.py
+++ b/sources/zh/uukanshu_sj.py
@@ -11,7 +11,7 @@
 chapter_list_url = "%s&page=%d"
 
 
-class UukanshuOnline(Crawler):
+class UukanshuOnlineSJ(Crawler):
     base_url = ["https://sj.uukanshu.net/"]  # previously .com, redirects .com to .net though
 
     def search_novel(self, query):
@@ -88,7 +88,8 @@ def download_chapter_body(self, chapter):
 
         return self.format_text(content)
 
-    def format_text(self, text):
+    @staticmethod
+    def format_text(text):
         text = re.sub(
             r"[UＵ][UＵ]\s*看书\s*[wｗ][wｗ][wｗ][\.．][uｕ][uｕ][kｋ][aａ][nｎ][sｓ][hｈ][uｕ][\.．][cｃ][oｏ][mｍ]",
             "",
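
The key refactor in the second commit is turning format_text into a staticmethod so the new UukanshuOnline crawler can reuse UukanshuOnlineSJ's watermark filter without constructing a second crawler instance. The following is a minimal, self-contained sketch of that reuse pattern, not part of the patch: the stub classes and the sample string are illustrative assumptions (the real classes derive from lncrawl's Crawler), and the regex mirrors the one in the diff above.

# Illustrative sketch only: why format_text became a @staticmethod --
# it can be called on the class itself, no instance required.
import re


class UukanshuOnlineSJ:  # stub standing in for the real crawler class
    @staticmethod
    def format_text(text):
        # same watermark filter as in the patch: strips "UU看书 www.uukanshu.com",
        # tolerating full-width variants of each latin letter and the dot
        return re.sub(
            r"[UＵ][UＵ]\s*看书\s*[wｗ][wｗ][wｗ][\.．][uｕ][uｕ][kｋ][aａ][nｎ]"
            r"[sｓ][hｈ][uｕ][\.．][cｃ][oｏ][mｍ]",
            "",
            text,
        )


class UukanshuOnline:  # stub; the real class also parses the chapter page first
    def download_chapter_body(self, chapter_text):
        # no UukanshuOnlineSJ() instance is needed -- the staticmethod is
        # reused directly on the sibling class, exactly as the new file does
        return UukanshuOnlineSJ.format_text(chapter_text)


if __name__ == "__main__":
    sample = "第一章 开始 UU看书 www.uukanshu.com 最新章节"
    print(UukanshuOnline().download_chapter_body(sample))
    # -> "第一章 开始  最新章节"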