From 8f08e6c33cb6465d691cc9105faade232ab0e9de Mon Sep 17 00:00:00 2001
From: Calibrain
Date: Mon, 23 Dec 2024 00:15:57 +0000
Subject: [PATCH] Fix #20: Network error when using alternative download
 streams

---
 backend.py         |   2 +-
 book_manager.py    | 126 ++++++++++++++++++++++++++++-----------
 config.py          |   5 ++
 docker-compose.yml |  13 ++---
 models.py          |   3 +-
 network.py         |  45 ++++++++++++++--
 readme.md          |  10 ++++
 requirements.txt   |   2 +-
 8 files changed, 145 insertions(+), 61 deletions(-)

diff --git a/backend.py b/backend.py
index 7e80402..bd89855 100644
--- a/backend.py
+++ b/backend.py
@@ -132,7 +132,7 @@ def _download_book(book_id: str) -> bool:
     """
     try:
         book_info = book_queue._book_data[book_id]
-        data = book_manager.download_book(book_id, book_info.title)
+        data = book_manager.download_book(book_info)
 
         if not data:
             raise Exception("No data received")
diff --git a/book_manager.py b/book_manager.py
index 46d9095..214f54a 100644
--- a/book_manager.py
+++ b/book_manager.py
@@ -1,14 +1,14 @@
 """Book download manager handling search and retrieval operations."""
 
 import time
-from urllib.parse import urlparse, quote
+from urllib.parse import quote
 from typing import List, Optional, Dict
 from bs4 import BeautifulSoup
 from io import BytesIO
 import json
 
 from logger import setup_logger
-from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY
+from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY, AA_BASE_URL, USE_CF_BYPASS
 from models import BookInfo
 import network
 
@@ -28,7 +28,8 @@ def search_books(query: str) -> List[BookInfo]:
     """
     query_html = quote(query)
     url = (
-        f"https://annas-archive.org/search?index=&page=1&display=table"
+        f"{AA_BASE_URL}"
+        f"/search?index=&page=1&display=table"
         f"&acc=aa_download&acc=external_download&sort="
        f"&ext={'&ext='.join(SUPPORTED_FORMATS)}&lang={'&lang='.join(BOOK_LANGUAGE)}&q={query_html}"
     )
@@ -98,22 +99,23 @@ def get_book_info(book_id: str) -> BookInfo:
     Returns:
         BookInfo: Detailed book information
     """
-    url = f"https://annas-archive.org/md5/{book_id}"
+    url = f"{AA_BASE_URL}/md5/{book_id}"
     html = network.html_get_page(url)
 
     if not html:
         raise Exception(f"Failed to fetch book info for ID: {book_id}")
 
     soup = BeautifulSoup(html, 'html.parser')
+
+    return _parse_book_info_page(soup, book_id)
+
+def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
+    """Parse the book info page HTML into a BookInfo object."""
     data = soup.select_one('body > main > div:nth-of-type(1)')
 
     if not data:
         raise Exception(f"Failed to parse book info for ID: {book_id}")
-
-    return _parse_book_info_page(data, book_id)
-
-def _parse_book_info_page(data, book_id: str) -> BookInfo:
-    """Parse the book info page HTML into a BookInfo object."""
+
     preview = data.select_one(
         'div:nth-of-type(1) > img'
     )['src']
@@ -138,6 +140,44 @@ def _parse_book_info_page(data, book_id: str) -> BookInfo:
         None
     )
 
+    every_url = soup.find_all('a')
+    slow_urls_no_waitlist = set()
+    slow_urls_with_waitlist = set()
+    external_urls_libgen = set()
+    external_urls_z_lib = set()
+
+
+    for url in every_url:
+        try:
+            if url.parent.text.strip().lower().startswith("option #"):
+                if url.text.strip().lower().startswith("slow partner server"):
+                    if url.next is not None and url.next.next is not None and "waitlist" in url.next.next.strip().lower():
+                        internal_text = url.next.next.strip().lower()
+                        if "no waitlist" in internal_text:
+                            slow_urls_no_waitlist.add(url['href'])
+                        else:
+                            slow_urls_with_waitlist.add(url['href'])
+                elif url.next is not None and url.next.next is not None and "click “GET” at the top" in url.next.next.text.strip():
+                    external_urls_libgen.add(url['href'])
+                elif url.text.strip().lower().startswith("z-lib"):
+                    if ".onion/" not in url['href']:
+                        external_urls_z_lib.add(url['href'])
+        except Exception:
+            pass
+
+
+    slow_urls_no_waitlist = list(slow_urls_no_waitlist)
+    slow_urls_with_waitlist = list(slow_urls_with_waitlist)
+    external_urls_libgen = list(external_urls_libgen)
+    external_urls_z_lib = list(external_urls_z_lib)
+
+    if USE_CF_BYPASS:
+        urls = slow_urls_no_waitlist + external_urls_libgen + slow_urls_with_waitlist + external_urls_z_lib
+    else:
+        urls = external_urls_libgen + external_urls_z_lib + slow_urls_no_waitlist + slow_urls_with_waitlist
+    for i in range(len(urls)):
+        urls[i] = network.get_absolute_url(AA_BASE_URL, urls[i])
+
     # Extract basic information
     book_info = BookInfo(
         id=book_id,
@@ -146,7 +186,8 @@ def _parse_book_info_page(data, book_id: str) -> BookInfo:
         preview=preview,
         title=divs[start_div_id].next,
         publisher=divs[start_div_id + 1].next,
         author=divs[start_div_id + 2].next,
         format=format,
-        size=size
+        size=size,
+        download_urls=urls
     )
@@ -198,7 +239,7 @@ def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
         and "filename" not in k.lower()
     }
 
-def download_book(book_id: str, title: str) -> Optional[BytesIO]:
+def download_book(book_info: BookInfo) -> Optional[BytesIO]:
     """Download a book from available sources.
 
     Args:
@@ -209,27 +250,24 @@ def download_book(book_id: str, title: str) -> Optional[BytesIO]:
     Returns:
         Optional[BytesIO]: Book content buffer if successful
     """
-    download_links = [
-        f"https://annas-archive.org/slow_download/{book_id}/0/2",
-        f"https://libgen.li/ads.php?md5={book_id}",
-        f"https://library.lol/fiction/{book_id}",
-        f"https://library.lol/main/{book_id}",
-        f"https://annas-archive.org/slow_download/{book_id}/0/0",
-        f"https://annas-archive.org/slow_download/{book_id}/0/1"
-    ]
-    """If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources."""
+
+    if len(book_info.download_urls) == 0:
+        book_info = get_book_info(book_info.id)
+    download_links = book_info.download_urls
+
+    # If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources.
     if AA_DONATOR_KEY is not None:
         download_links.insert(0,
-            f"https://annas-archive.org/dyn/api/fast_download.json?md5={book_id}&key={AA_DONATOR_KEY}"
+            f"{AA_BASE_URL}/dyn/api/fast_download.json?md5={book_info.id}&key={AA_DONATOR_KEY}"
         )
-
+
     for link in download_links:
         try:
-            download_url = _get_download_url(link, title)
+            download_url = _get_download_url(link, book_info.title)
             if download_url:
-                logger.info(f"Downloading {title} from {download_url}")
-                return network.download_url(download_url)
+                logger.info(f"Downloading {book_info.title} from {download_url}")
+                return network.download_url(download_url, book_info.size)
         except Exception as e:
             logger.error(f"Failed to download from {link}: {e}")
             continue
@@ -239,35 +277,27 @@ def download_book(book_id: str, title: str) -> Optional[BytesIO]:
 
 def _get_download_url(link: str, title: str) -> Optional[str]:
     """Extract actual download URL from various source pages."""
-    if link.startswith("https://annas-archive.org/dyn/api/fast_download.json"):
+    if link.startswith(f"{AA_BASE_URL}/dyn/api/fast_download.json"):
         page = network.html_get_page(link)
         return json.loads(page).get("download_url")
-    html = network.html_get_page_cf(link)
+
+    try:
+        html = network.html_get_page(link, retry=0, skip_403=True)
+    except Exception:
+        html = None
+    if not html:  # on 403 or plain failure, retry through the Cloudflare bypass
+        html = network.html_get_page_cf(link)
+
     if not html:
         return None
-
+
     soup = BeautifulSoup(html, 'html.parser')
+    url = None
     if link.startswith("https://z-lib.gs"):
         download_link = soup.find_all('a', href=True, class_="addDownloadedBook")
         if download_link:
-            parsed = urlparse(download_link[0]['href'])
-            return f"{parsed.scheme}://{parsed.netloc}{download_link[0]['href']}"
-
-    elif link.startswith("https://libgen.li"):
-        get_section = soup.find_all('h2', string="GET")
-        if get_section:
-            href = get_section[0].parent['href']
-            parsed = urlparse(href)
-            return f"{parsed.scheme}://{parsed.netloc}/{href}"
-
-    elif link.startswith("https://library.lol/fiction/"):
-        get_section = soup.find_all('h2', string="GET")
-        if get_section:
-            return get_section[0].parent['href']
-
-    elif link.startswith("https://annas-archive.org/slow_download/"):
+            url = download_link[0]['href']
+    elif link.startswith(f"{AA_BASE_URL}/slow_download/"):
         download_links = soup.find_all('a', href=True, string="📚 Download now")
         if not download_links:
             countdown = soup.find_all('span', class_="js-partner-countdown")
@@ -277,6 +307,8 @@ def _get_download_url(link: str, title: str) -> Optional[str]:
                 time.sleep(sleep_time + 5)
                 return _get_download_url(link, title)
         else:
-            return download_links[0]['href']
-
-    return None
+            url = download_links[0]['href']
+    else:
+        url = soup.find_all('a', string="GET")[0]['href']
+
+    return network.get_absolute_url(link, url)
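As a reference for reviewers, here is a minimal standalone sketch of the parse-tree traversal `_parse_book_info_page` now relies on. The sample HTML is invented for illustration: for an `<a>` whose only child is its label text, `tag.next` is that label and `tag.next.next` is the text following the closing tag, which carries the waitlist annotation.

```python
from bs4 import BeautifulSoup

# Invented sample mimicking the "Option #N" mirror list on a book page.
sample = """
<ul>
  <li>Option #1: <a href="/slow_download/x/0/0">Slow Partner Server #1</a> (no waitlist)</li>
  <li>Option #2: <a href="/slow_download/x/0/1">Slow Partner Server #2</a> (slightly faster but with waitlist)</li>
</ul>
"""

soup = BeautifulSoup(sample, "html.parser")
for a in soup.find_all("a"):
    if a.parent.text.strip().lower().startswith("option #"):
        # a.next is the anchor's label; a.next.next is the trailing annotation.
        annotation = a.next.next.strip().lower()
        bucket = "no waitlist" if "no waitlist" in annotation else "waitlist"
        print(a["href"], "->", bucket)
# /slow_download/x/0/0 -> no waitlist
# /slow_download/x/0/1 -> waitlist
```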
diff --git a/config.py b/config.py
index 731508f..612bbc9 100644
--- a/config.py
+++ b/config.py
@@ -26,7 +26,12 @@
 MAX_RETRY = int(os.getenv("MAX_RETRY", 3))
 DEFAULT_SLEEP = int(os.getenv("DEFAULT_SLEEP", 5))
 CLOUDFLARE_PROXY = os.getenv("CLOUDFLARE_PROXY_URL", "http://localhost:8000")
+USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "true").strip().lower()
+USE_CF_BYPASS = USE_CF_BYPASS in ["true", "yes", "1", "y"]
+
+# Anna's Archive settings
 AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", None)
+AA_BASE_URL = os.getenv("AA_BASE_URL", "https://annas-archive.org").strip("/")
 
 # File format settings
 SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr")
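The two-step truthiness parsing above is easiest to see with a couple of throwaway values (invented here); anything outside the accepted set comes out `False`.

```python
import os

def parse_flag(raw: str) -> bool:
    # Mirrors the USE_CF_BYPASS parsing in config.py.
    return raw.strip().lower() in ["true", "yes", "1", "y"]

print(parse_flag("Yes"))  # True
print(parse_flag("1"))    # True
print(parse_flag("off"))  # False -- not in the accepted set
print(parse_flag(os.getenv("USE_CF_BYPASS", "true")))  # True unless the env var says otherwise
```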
diff --git a/docker-compose.yml b/docker-compose.yml
index c1179ac..de22948 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,12 +10,13 @@ services:
       BOOK_LANGUAGE: en
     ports:
       - 8084:8084
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8084/request/api/status"]
-      interval: 30s
-      timeout: 30s
-      retries: 3
-      start_period: 5s
+    # Uncomment the following lines if you want to enable the healthcheck
+    #healthcheck:
+    #  test: ["CMD", "curl", "-f", "http://localhost:8084/request/api/status"]
+    #  interval: 30s
+    #  timeout: 30s
+    #  retries: 3
+    #  start_period: 5s
     restart: unless-stopped
     volumes:
       # This is where the books will be downloaded to, usually it would be
diff --git a/models.py b/models.py
index 4613c43..340f919 100644
--- a/models.py
+++ b/models.py
@@ -1,6 +1,6 @@
 """Data structures and models used across the application."""
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Dict, List, Optional
 from enum import Enum
 from config import INGEST_DIR, STATUS_TIMEOUT
@@ -27,6 +27,7 @@ class BookInfo:
     format: Optional[str] = None
     size: Optional[str] = None
     info: Optional[Dict[str, List[str]]] = None
+    download_urls: Optional[List[str]] = field(default_factory=list)
 
 class BookQueue:
     """Thread-safe book queue manager."""
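A note on the `field(default_factory=list)` used for `download_urls`: dataclasses reject a bare mutable default like `= []`, since it would be shared by every instance. A throwaway class (invented for illustration) makes the per-instance behaviour visible:

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Demo:
    # Same pattern as BookInfo.download_urls: a fresh list per instance.
    urls: List[str] = field(default_factory=list)

a, b = Demo(), Demo()
a.urls.append("http://example.org/1")
print(a.urls)  # ['http://example.org/1']
print(b.urls)  # [] -- b got its own list, not a shared one
```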
diff --git a/network.py b/network.py
index 561597b..cf03ada 100644
--- a/network.py
+++ b/network.py
@@ -5,9 +5,11 @@
 from io import BytesIO
 import urllib.request
 from typing import Optional
+from urllib.parse import urlparse
+from tqdm import tqdm
 
 from logger import setup_logger
-from config import MAX_RETRY, DEFAULT_SLEEP, CLOUDFLARE_PROXY, AA_DONATOR_KEY
+from config import MAX_RETRY, DEFAULT_SLEEP, CLOUDFLARE_PROXY, USE_CF_BYPASS
 
 logger = setup_logger(__name__)
 
@@ -23,7 +25,7 @@ def setup_urllib_opener():
 
 setup_urllib_opener()
 
-def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False) -> Optional[str]:
+def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> Optional[str]:
     """Fetch HTML content from a URL with retry mechanism.
 
     Args:
@@ -41,6 +43,10 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False) -> O
         if skip_404 and response.status_code == 404:
             logger.warning(f"404 error for URL: {url}")
             return None
+
+        if skip_403 and response.status_code == 403:
+            logger.warning(f"403 error for URL: {url}. Should retry using Cloudflare bypass.")
+            return None
 
         response.raise_for_status()
         time.sleep(1)
@@ -68,6 +74,9 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
     Returns:
         str: HTML content if successful, None otherwise
     """
+    if not USE_CF_BYPASS:
+        logger.warning("Cloudflare bypass is disabled, trying without it.")
+        return html_get_page(url, retry, skip_403=True)
     try:
         logger.info(f"GET_CF: {url}")
         response = requests.get(
@@ -88,7 +97,7 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
         time.sleep(sleep_time)
         return html_get_page_cf(url, retry - 1)
 
-def download_url(link: str) -> Optional[BytesIO]:
+def download_url(link: str, size: Optional[str] = None) -> Optional[BytesIO]:
     """Download content from URL into a BytesIO buffer.
 
     Args:
@@ -101,11 +110,37 @@ def download_url(link: str) -> Optional[BytesIO]:
         logger.info(f"Downloading from: {link}")
         response = requests.get(link, stream=True)
         response.raise_for_status()
+
+        try:
+            total_size = size.strip().replace(" ", "").replace(",", ".").upper()
+            # we assume the size hint from the book page is in MB
+            total_size = int(float(total_size[:-2].strip()) * 1024 * 1024)
+        except Exception:
+            total_size = int(response.headers.get('content-length', 0))
 
         buffer = BytesIO()
-        buffer.write(response.content)
+        with tqdm(total=total_size, unit='B', unit_scale=True, unit_divisor=1024) as progress:
+            for chunk in response.iter_content(chunk_size=1024):
+                buffer.write(chunk)
+                progress.update(len(chunk))
+
         buffer.seek(0)
         return buffer
 
     except requests.exceptions.RequestException as e:
         logger.error(f"Failed to download from {link}: {e}")
-        return None
\ No newline at end of file
+        return None
+
+def get_absolute_url(base_url: str, url: str) -> Optional[str]:
+    """Get absolute URL from a possibly relative URL and a base URL.
+
+    Args:
+        base_url: Base URL
+        url: Relative or absolute URL
+    """
+    if url is None or url.strip() == "":
+        return None
+    if url.startswith("http"):
+        return url
+    parsed_url = urlparse(url)
+    parsed_base = urlparse(base_url)
+    if parsed_url.netloc == "" or parsed_url.scheme == "":
+        parsed_url = parsed_url._replace(netloc=parsed_base.netloc, scheme=parsed_base.scheme)
+    return parsed_url.geturl()
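To make the behaviour of the new `get_absolute_url` helper concrete, here it is restated as a standalone snippet with invented inputs: fully qualified URLs pass through untouched, while scheme- or host-less hrefs inherit both parts from the base URL.

```python
from urllib.parse import urlparse

def get_absolute_url(base_url, url):
    # Standalone copy of network.get_absolute_url, for illustration only.
    if url is None or url.strip() == "":
        return None
    if url.startswith("http"):
        return url
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    if parsed_url.netloc == "" or parsed_url.scheme == "":
        parsed_url = parsed_url._replace(netloc=parsed_base.netloc, scheme=parsed_base.scheme)
    return parsed_url.geturl()

print(get_absolute_url("https://annas-archive.org", "/slow_download/abc/0/0"))
# https://annas-archive.org/slow_download/abc/0/0
print(get_absolute_url("https://libgen.li/ads.php?md5=abc", "get.php?md5=abc&key=XYZ"))
# https://libgen.li/get.php?md5=abc&key=XYZ
print(get_absolute_url("https://annas-archive.org", "https://example.org/file.epub"))
# https://example.org/file.epub
```

Note that the helper deliberately ignores the base URL's path, which is fine for the host-relative hrefs these pages emit.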
diff --git a/readme.md b/readme.md
index 413efd0..887ec6f 100644
--- a/readme.md
+++ b/readme.md
@@ -73,7 +73,15 @@ An intuitive web interface for searching and requesting book downloads, designed
 
 Note that PDF are NOT supported at the moment (they do not get ingested by CWA, but if you want to just download them locally, you can add `pdf` to the `SUPPORTED_FORMATS` env
 
+#### AA
+
+| Variable        | Description                                                            | Default Value               |
+| --------------- | ---------------------------------------------------------------------- | --------------------------- |
+| `AA_BASE_URL`   | Base URL of Anna's Archive (can be changed to point at a proxy)         | `https://annas-archive.org` |
+| `USE_CF_BYPASS` | Enable the Cloudflare bypass; set to `false` to use alternative links   | `true`                      |
+
 If you are a donator on AA, you can use your Key in `AA_DONATOR_API_KEY` to speed up downloads and bypass the wait times.
+If you disable the Cloudflare bypass, alternative download hosts such as Libgen or Z-Library are used instead. They usually lag behind on recent books and their collections are not as big as AA's, but this setting should work for the majority of books.
 
 #### Network Settings
 
@@ -82,6 +90,8 @@ If you are a donator on AA, you can use your Key in `AA_DONATOR_API_KEY` to spee
 | `CLOUDFLARE_PROXY_URL` | Cloudflare bypass service URL | `http://localhost:8000` |
 | `PORT`                 | Container external port       | `8084`                  |
 
+`CLOUDFLARE_PROXY_URL` is ignored if `USE_CF_BYPASS` is set to `false`.
+
 ### Volume Configuration
 
 ```yaml
diff --git a/requirements.txt b/requirements.txt
index 29c3caa..49a6329 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 flask
 requests
 beautifulsoup4
-tqdm
\ No newline at end of file
+tqdm
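For completeness, the `size` string handed to `network.download_url` is the human-readable figure scraped from the book page. A standalone sketch (sample values invented) of the MB-string-to-bytes conversion it attempts before falling back to the `Content-Length` header:

```python
def size_to_bytes(size: str) -> int:
    # Mirrors the parsing in download_url: normalise whitespace and comma
    # decimal separators, drop the two-character "MB" suffix, and assume
    # the figure is in megabytes.
    normalized = size.strip().replace(" ", "").replace(",", ".").upper()
    return int(float(normalized[:-2].strip()) * 1024 * 1024)

print(size_to_bytes("2.7MB"))   # 2831155
print(size_to_bytes("2,7 MB"))  # 2831155 -- comma decimals normalise too
```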