From 8f08e6c33cb6465d691cc9105faade232ab0e9de Mon Sep 17 00:00:00 2001
From: Calibrain
Date: Mon, 23 Dec 2024 00:15:57 +0000
Subject: [PATCH] Fix #20: Network error when using alternative download
 streams

---
 backend.py         |   2 +-
 book_manager.py    | 126 ++++++++++++++++++++++++++++-----------
 config.py          |   5 ++
 docker-compose.yml |  13 ++---
 models.py          |   3 +-
 network.py         |  45 ++++++++++++++--
 readme.md          |  10 ++++
 requirements.txt   |   2 +-
 8 files changed, 145 insertions(+), 61 deletions(-)

diff --git a/backend.py b/backend.py
index 7e80402..bd89855 100644
--- a/backend.py
+++ b/backend.py
@@ -132,7 +132,7 @@ def _download_book(book_id: str) -> bool:
     """
     try:
         book_info = book_queue._book_data[book_id]
-        data = book_manager.download_book(book_id, book_info.title)
+        data = book_manager.download_book(book_info)
 
         if not data:
             raise Exception("No data received")
diff --git a/book_manager.py b/book_manager.py
index 46d9095..214f54a 100644
--- a/book_manager.py
+++ b/book_manager.py
@@ -1,14 +1,14 @@
 """Book download manager handling search and retrieval operations."""
 
 import time
-from urllib.parse import urlparse, quote
+from urllib.parse import quote
 from typing import List, Optional, Dict
 from bs4 import BeautifulSoup
 from io import BytesIO
 import json
 
 from logger import setup_logger
-from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY
+from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY, AA_BASE_URL, USE_CF_BYPASS
 from models import BookInfo
 import network
 
@@ -28,7 +28,8 @@ def search_books(query: str) -> List[BookInfo]:
     """
     query_html = quote(query)
     url = (
-        f"https://annas-archive.org/search?index=&page=1&display=table"
+        f"{AA_BASE_URL}"
+        f"/search?index=&page=1&display=table"
         f"&acc=aa_download&acc=external_download&sort="
        f"&ext={'&ext='.join(SUPPORTED_FORMATS)}&lang={'&lang='.join(BOOK_LANGUAGE)}&q={query_html}"
     )
@@ -98,22 +99,23 @@ def get_book_info(book_id: str) -> BookInfo:
     Returns:
         BookInfo: Detailed book information
     """
-    url = f"https://annas-archive.org/md5/{book_id}"
+    url = f"{AA_BASE_URL}/md5/{book_id}"
     html = network.html_get_page(url)
 
     if not html:
         raise Exception(f"Failed to fetch book info for ID: {book_id}")
 
     soup = BeautifulSoup(html, 'html.parser')
+
+    return _parse_book_info_page(soup, book_id)
+
+def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
+    """Parse the book info page HTML into a BookInfo object."""
     data = soup.select_one('body > main > div:nth-of-type(1)')
 
     if not data:
         raise Exception(f"Failed to parse book info for ID: {book_id}")
-
-    return _parse_book_info_page(data, book_id)
-
-def _parse_book_info_page(data, book_id: str) -> BookInfo:
-    """Parse the book info page HTML into a BookInfo object."""
+
     preview = data.select_one(
         'div:nth-of-type(1) > img'
     )['src']
@@ -138,6 +140,44 @@ def _parse_book_info_page(data, book_id: str) -> BookInfo:
         None
     )
 
+    every_url = soup.find_all('a')
+    slow_urls_no_waitlist = set()
+    slow_urls_with_waitlist = set()
+    external_urls_libgen = set()
+    external_urls_z_lib = set()
+
+
+    for url in every_url:
+        try:
+            if url.parent.text.strip().lower().startswith("option #"):
+                if url.text.strip().lower().startswith("slow partner server"):
+                    if url.next is not None and url.next.next is not None and "waitlist" in url.next.next.strip().lower():
+                        internal_text = url.next.next.strip().lower()
+                        if "no waitlist" in internal_text:
+                            slow_urls_no_waitlist.add(url['href'])
+                        else:
+                            slow_urls_with_waitlist.add(url['href'])
+                elif url.next is not None and url.next.next is not None and "click “GET” at the top" in url.next.next.text.strip():
+                    external_urls_libgen.add(url['href'])
+                elif url.text.strip().lower().startswith("z-lib"):
+                    if ".onion/" not in url['href']:
+                        external_urls_z_lib.add(url['href'])
+        except Exception:
+            pass
+
+
+    slow_urls_no_waitlist = list(slow_urls_no_waitlist)
+    slow_urls_with_waitlist = list(slow_urls_with_waitlist)
+    external_urls_libgen = list(external_urls_libgen)
+    external_urls_z_lib = list(external_urls_z_lib)
+
+    if USE_CF_BYPASS:
+        urls = slow_urls_no_waitlist + external_urls_libgen + slow_urls_with_waitlist + external_urls_z_lib
+    else:
+        urls = external_urls_libgen + external_urls_z_lib + slow_urls_no_waitlist + slow_urls_with_waitlist
+    for i in range(len(urls)):
+        urls[i] = network.get_absolute_url(AA_BASE_URL, urls[i])
+
     # Extract basic information
     book_info = BookInfo(
         id=book_id,
@@ -146,7 +186,8 @@ def _parse_book_info_page(data, book_id: str) -> BookInfo:
         preview=preview,
         title=divs[start_div_id].next,
         publisher=divs[start_div_id + 1].next,
         author=divs[start_div_id + 2].next,
         format=format,
-        size=size
+        size=size,
+        download_urls=urls
     )
@@ -198,7 +239,7 @@ def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
         and "filename" not in k.lower()
     }
 
-def download_book(book_id: str, title: str) -> Optional[BytesIO]:
+def download_book(book_info: BookInfo) -> Optional[BytesIO]:
     """Download a book from available sources.
 
     Args:
@@ -209,27 +250,24 @@ def download_book(book_id: str, title: str) -> Optional[BytesIO]:
     Returns:
         Optional[BytesIO]: Book content buffer if successful
     """
-    download_links = [
-        f"https://annas-archive.org/slow_download/{book_id}/0/2",
-        f"https://libgen.li/ads.php?md5={book_id}",
-        f"https://library.lol/fiction/{book_id}",
-        f"https://library.lol/main/{book_id}",
-        f"https://annas-archive.org/slow_download/{book_id}/0/0",
-        f"https://annas-archive.org/slow_download/{book_id}/0/1"
-    ]
-    """If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources."""
+
+    if len(book_info.download_urls) == 0:
+        book_info = get_book_info(book_info.id)
+    download_links = book_info.download_urls
+
+    # If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources.
     if AA_DONATOR_KEY is not None:
         download_links.insert(0,
-            f"https://annas-archive.org/dyn/api/fast_download.json?md5={book_id}&key={AA_DONATOR_KEY}"
+            f"{AA_BASE_URL}/dyn/api/fast_download.json?md5={book_info.id}&key={AA_DONATOR_KEY}"
         )
-
+
     for link in download_links:
         try:
-            download_url = _get_download_url(link, title)
+            download_url = _get_download_url(link, book_info.title)
             if download_url:
-                logger.info(f"Downloading {title} from {download_url}")
-                return network.download_url(download_url)
+                logger.info(f"Downloading {book_info.title} from {download_url}")
+                return network.download_url(download_url, book_info.size)
         except Exception as e:
             logger.error(f"Failed to download from {link}: {e}")
             continue
@@ -239,35 +277,27 @@ def download_book(book_id: str, title: str) -> Optional[BytesIO]:
 
 def _get_download_url(link: str, title: str) -> Optional[str]:
     """Extract actual download URL from various source pages."""
-    if link.startswith("https://annas-archive.org/dyn/api/fast_download.json"):
+    if link.startswith(f"{AA_BASE_URL}/dyn/api/fast_download.json"):
         page = network.html_get_page(link)
         return json.loads(page).get("download_url")
-    html = network.html_get_page_cf(link)
+
+    try:
+        html = network.html_get_page(link, retry=0, skip_403=True)
+    except Exception:
+        html = None
+    if not html:  # on 403 or plain failure, retry through the Cloudflare bypass
+        html = network.html_get_page_cf(link)
+
     if not html:
         return None
-
+
     soup = BeautifulSoup(html, 'html.parser')
+    url = None
     if link.startswith("https://z-lib.gs"):
         download_link = soup.find_all('a', href=True, class_="addDownloadedBook")
         if download_link:
-            parsed = urlparse(download_link[0]['href'])
-            return f"{parsed.scheme}://{parsed.netloc}{download_link[0]['href']}"
-
-    elif link.startswith("https://libgen.li"):
-        get_section = soup.find_all('h2', string="GET")
-        if get_section:
-            href = get_section[0].parent['href']
-            parsed = urlparse(href)
-            return f"{parsed.scheme}://{parsed.netloc}/{href}"
-
-    elif link.startswith("https://library.lol/fiction/"):
-        get_section = soup.find_all('h2', string="GET")
-        if get_section:
-            return get_section[0].parent['href']
-
-    elif link.startswith("https://annas-archive.org/slow_download/"):
+            url = download_link[0]['href']
+    elif link.startswith(f"{AA_BASE_URL}/slow_download/"):
         download_links = soup.find_all('a', href=True, string="📚 Download now")
         if not download_links:
             countdown = soup.find_all('span', class_="js-partner-countdown")
@@ -277,6 +307,8 @@ def _get_download_url(link: str, title: str) -> Optional[str]:
                 time.sleep(sleep_time + 5)
                 return _get_download_url(link, title)
         else:
-            return download_links[0]['href']
-
-    return None
+            url = download_links[0]['href']
+    else:
+        url = soup.find_all('a', string="GET")[0]['href']
+
+    return network.get_absolute_url(link, url)
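As a reference for reviewers, here is a minimal standalone sketch of the parse-tree traversal `_parse_book_info_page` now relies on. The sample HTML is invented for illustration: for an `<a>` whose only child is its label text, `tag.next` is that label and `tag.next.next` is the text following the closing tag, which carries the waitlist annotation.

```python
from bs4 import BeautifulSoup

# Invented sample mimicking the "Option #N" mirror list on a book page.
sample = """
<ul>
  <li>Option #1: <a href="/slow_download/x/0/0">Slow Partner Server #1</a> (no waitlist)</li>
  <li>Option #2: <a href="/slow_download/x/0/1">Slow Partner Server #2</a> (slightly faster but with waitlist)</li>
</ul>
"""

soup = BeautifulSoup(sample, "html.parser")
for a in soup.find_all("a"):
    if a.parent.text.strip().lower().startswith("option #"):
        # a.next is the anchor's label; a.next.next is the trailing annotation.
        annotation = a.next.next.strip().lower()
        bucket = "no waitlist" if "no waitlist" in annotation else "waitlist"
        print(a["href"], "->", bucket)
# /slow_download/x/0/0 -> no waitlist
# /slow_download/x/0/1 -> waitlist
```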
diff --git a/config.py b/config.py
index 731508f..612bbc9 100644
--- a/config.py
+++ b/config.py
@@ -26,7 +26,12 @@
 MAX_RETRY = int(os.getenv("MAX_RETRY", 3))
 DEFAULT_SLEEP = int(os.getenv("DEFAULT_SLEEP", 5))
 CLOUDFLARE_PROXY = os.getenv("CLOUDFLARE_PROXY_URL", "http://localhost:8000")
+USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "true").strip().lower()
+USE_CF_BYPASS = USE_CF_BYPASS in ["true", "yes", "1", "y"]
+
+# Anna's Archive settings
 AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", None)
+AA_BASE_URL = os.getenv("AA_BASE_URL", "https://annas-archive.org").strip("/")
 
 # File format settings
 SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr")
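The two-step truthiness parsing above is easiest to see with a couple of throwaway values (invented here); anything outside the accepted set comes out `False`.

```python
import os

def parse_flag(raw: str) -> bool:
    # Mirrors the USE_CF_BYPASS parsing in config.py.
    return raw.strip().lower() in ["true", "yes", "1", "y"]

print(parse_flag("Yes"))  # True
print(parse_flag("1"))    # True
print(parse_flag("off"))  # False -- not in the accepted set
print(parse_flag(os.getenv("USE_CF_BYPASS", "true")))  # True unless the env var says otherwise
```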
diff --git a/docker-compose.yml b/docker-compose.yml
index c1179ac..de22948 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,12 +10,13 @@ services:
       BOOK_LANGUAGE: en
     ports:
       - 8084:8084
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8084/request/api/status"]
-      interval: 30s
-      timeout: 30s
-      retries: 3
-      start_period: 5s
+    # Uncomment the following lines if you want to enable the healthcheck
+    #healthcheck:
+    #  test: ["CMD", "curl", "-f", "http://localhost:8084/request/api/status"]
+    #  interval: 30s
+    #  timeout: 30s
+    #  retries: 3
+    #  start_period: 5s
     restart: unless-stopped
     volumes:
       # This is where the books will be downloaded to, usually it would be
diff --git a/models.py b/models.py
index 4613c43..340f919 100644
--- a/models.py
+++ b/models.py
@@ -1,6 +1,6 @@
 """Data structures and models used across the application."""
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Dict, List, Optional
 from enum import Enum
 from config import INGEST_DIR, STATUS_TIMEOUT
@@ -27,6 +27,7 @@ class BookInfo:
     format: Optional[str] = None
     size: Optional[str] = None
     info: Optional[Dict[str, List[str]]] = None
+    download_urls: Optional[List[str]] = field(default_factory=list)
 
 class BookQueue:
     """Thread-safe book queue manager."""
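A note on the `field(default_factory=list)` used for `download_urls`: dataclasses reject a bare mutable default like `= []`, since it would be shared by every instance. A throwaway class (invented for illustration) makes the per-instance behaviour visible:

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Demo:
    # Same pattern as BookInfo.download_urls: a fresh list per instance.
    urls: List[str] = field(default_factory=list)

a, b = Demo(), Demo()
a.urls.append("http://example.org/1")
print(a.urls)  # ['http://example.org/1']
print(b.urls)  # [] -- b got its own list, not a shared one
```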
diff --git a/network.py b/network.py
index 561597b..cf03ada 100644
--- a/network.py
+++ b/network.py
@@ -5,9 +5,11 @@
 from io import BytesIO
 import urllib.request
 from typing import Optional
+from urllib.parse import urlparse
+from tqdm import tqdm
 
 from logger import setup_logger
-from config import MAX_RETRY, DEFAULT_SLEEP, CLOUDFLARE_PROXY, AA_DONATOR_KEY
+from config import MAX_RETRY, DEFAULT_SLEEP, CLOUDFLARE_PROXY, USE_CF_BYPASS
 
 logger = setup_logger(__name__)
 
@@ -23,7 +25,7 @@ def setup_urllib_opener():
 
 setup_urllib_opener()
 
-def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False) -> Optional[str]:
+def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> Optional[str]:
     """Fetch HTML content from a URL with retry mechanism.
 
     Args:
@@ -41,6 +43,10 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False) -> O
         if skip_404 and response.status_code == 404:
             logger.warning(f"404 error for URL: {url}")
             return None
+
+        if skip_403 and response.status_code == 403:
+            logger.warning(f"403 error for URL: {url}. Should retry using Cloudflare bypass.")
+            return None
 
         response.raise_for_status()
         time.sleep(1)
@@ -68,6 +74,9 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
     Returns:
         str: HTML content if successful, None otherwise
     """
+    if not USE_CF_BYPASS:
+        logger.warning("Cloudflare bypass is disabled, trying without it.")
+        return html_get_page(url, retry, skip_403=True)
     try:
         logger.info(f"GET_CF: {url}")
         response = requests.get(
@@ -88,7 +97,7 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
         time.sleep(sleep_time)
         return html_get_page_cf(url, retry - 1)
 
-def download_url(link: str) -> Optional[BytesIO]:
+def download_url(link: str, size: Optional[str] = None) -> Optional[BytesIO]:
     """Download content from URL into a BytesIO buffer.
 
     Args:
@@ -101,11 +110,37 @@ def download_url(link: str) -> Optional[BytesIO]:
         logger.info(f"Downloading from: {link}")
         response = requests.get(link, stream=True)
         response.raise_for_status()
+
+        try:
+            total_size = size.strip().replace(" ", "").replace(",", ".").upper()
+            # we assume the size hint from the book page is in MB
+            total_size = int(float(total_size[:-2].strip()) * 1024 * 1024)
+        except Exception:
+            total_size = int(response.headers.get('content-length', 0))
 
         buffer = BytesIO()
-        buffer.write(response.content)
+        with tqdm(total=total_size, unit='B', unit_scale=True, unit_divisor=1024) as progress:
+            for chunk in response.iter_content(chunk_size=1024):
+                buffer.write(chunk)
+                progress.update(len(chunk))
+
         buffer.seek(0)
         return buffer
 
     except requests.exceptions.RequestException as e:
         logger.error(f"Failed to download from {link}: {e}")
-        return None
\ No newline at end of file
+        return None
+
+def get_absolute_url(base_url: str, url: str) -> Optional[str]:
+    """Get absolute URL from a possibly relative URL and a base URL.
+
+    Args:
+        base_url: Base URL
+        url: Relative or absolute URL
+    """
+    if url is None or url.strip() == "":
+        return None
+    if url.startswith("http"):
+        return url
+    parsed_url = urlparse(url)
+    parsed_base = urlparse(base_url)
+    if parsed_url.netloc == "" or parsed_url.scheme == "":
+        parsed_url = parsed_url._replace(netloc=parsed_base.netloc, scheme=parsed_base.scheme)
+    return parsed_url.geturl()
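To make the behaviour of the new `get_absolute_url` helper concrete, here it is restated as a standalone snippet with invented inputs: fully qualified URLs pass through untouched, while scheme- or host-less hrefs inherit both parts from the base URL.

```python
from urllib.parse import urlparse

def get_absolute_url(base_url, url):
    # Standalone copy of network.get_absolute_url, for illustration only.
    if url is None or url.strip() == "":
        return None
    if url.startswith("http"):
        return url
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    if parsed_url.netloc == "" or parsed_url.scheme == "":
        parsed_url = parsed_url._replace(netloc=parsed_base.netloc, scheme=parsed_base.scheme)
    return parsed_url.geturl()

print(get_absolute_url("https://annas-archive.org", "/slow_download/abc/0/0"))
# https://annas-archive.org/slow_download/abc/0/0
print(get_absolute_url("https://libgen.li/ads.php?md5=abc", "get.php?md5=abc&key=XYZ"))
# https://libgen.li/get.php?md5=abc&key=XYZ
print(get_absolute_url("https://annas-archive.org", "https://example.org/file.epub"))
# https://example.org/file.epub
```

Note that the helper deliberately ignores the base URL's path, which is fine for the host-relative hrefs these pages emit.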
diff --git a/readme.md b/readme.md
index 413efd0..887ec6f 100644
--- a/readme.md
+++ b/readme.md
@@ -73,7 +73,15 @@ An intuitive web interface for searching and requesting book downloads, designed
 
 Note that PDF are NOT supported at the moment (they do not get ingested by CWA, but if you want to just download them locally, you can add `pdf` to the `SUPPORTED_FORMATS` env
 
+#### AA
+
+| Variable        | Description                                                            | Default Value               |
+| --------------- | ---------------------------------------------------------------------- | --------------------------- |
+| `AA_BASE_URL`   | Base URL of Anna's Archive (can be changed to point at a proxy)         | `https://annas-archive.org` |
+| `USE_CF_BYPASS` | Enable the Cloudflare bypass; set to `false` to use alternative links   | `true`                      |
+
 If you are a donator on AA, you can use your Key in `AA_DONATOR_API_KEY` to speed up downloads and bypass the wait times.
+If you disable the Cloudflare bypass, alternative download hosts such as Libgen or Z-Library are used instead. They usually lag behind on recent books and their collections are not as big as AA's, but this setting should work for the majority of books.
 
 #### Network Settings
 
@@ -82,6 +90,8 @@ If you are a donator on AA, you can use your Key in `AA_DONATOR_API_KEY` to spee
 | `CLOUDFLARE_PROXY_URL` | Cloudflare bypass service URL | `http://localhost:8000` |
 | `PORT`                 | Container external port       | `8084`                  |
 
+`CLOUDFLARE_PROXY_URL` is ignored if `USE_CF_BYPASS` is set to `false`.
+
 ### Volume Configuration
 
 ```yaml
diff --git a/requirements.txt b/requirements.txt
index 29c3caa..49a6329 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 flask
 requests
 beautifulsoup4
-tqdm
\ No newline at end of file
+tqdm
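For completeness, the `size` string handed to `network.download_url` is the human-readable figure scraped from the book page. A standalone sketch (sample values invented) of the MB-string-to-bytes conversion it attempts before falling back to the `Content-Length` header:

```python
def size_to_bytes(size: str) -> int:
    # Mirrors the parsing in download_url: normalise whitespace and comma
    # decimal separators, drop the two-character "MB" suffix, and assume
    # the figure is in megabytes.
    normalized = size.strip().replace(" ", "").replace(",", ".").upper()
    return int(float(normalized[:-2].strip()) * 1024 * 1024)

print(size_to_bytes("2.7MB"))   # 2831155
print(size_to_bytes("2,7 MB"))  # 2831155 -- comma decimals normalise too
```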