From b8a72475381c9245dfccac539b4e882f26d74e8b Mon Sep 17 00:00:00 2001
From: CaliBrain
Date: Sun, 22 Dec 2024 22:11:35 -0500
Subject: [PATCH] [FIX] Refactor code to fix typing (#23)

This also fixes some bugs with the CF bypasser and the file checker.
---
 Dockerfile       |  2 ++
 app.py           |  6 ++--
 backend.py       | 23 ++++++--------
 book_manager.py  | 83 +++++++++++++++++++++++++++---------------------
 check_health.sh  | 27 +++-------------
 config.py        | 20 ++++--------
 logger.py        |  6 ++--
 models.py        |  6 ++--
 network.py       | 49 +++++++++++++++------------
 requirements.txt |  3 ++
 10 files changed, 107 insertions(+), 118 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ef04af7..ce4dfa7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,6 +17,8 @@ ENV CLOUDFLARE_PROXY_URL=http://localhost:8000
 ENV INGEST_DIR=/cwa-book-ingest
 ENV STATUS_TIMEOUT=3600
 ENV PYTHONPATH=/app
+ENV USE_CF_BYPASS=true
+ENV AA_BASE_URL=https://annas-archive.org
 
 # Default UID and GID (can be overridden at runtime)
 ENV UID=1000
diff --git a/app.py b/app.py
index 739a9c2..09076cd 100644
--- a/app.py
+++ b/app.py
@@ -12,7 +12,7 @@
 logger = setup_logger(__name__)
 
 app = Flask(__name__)
-app.wsgi_app = ProxyFix(app.wsgi_app)
+app.wsgi_app = ProxyFix(app.wsgi_app)  # type: ignore
 app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0  # Disable caching
 app.config['APPLICATION_ROOT'] = '/'
 
@@ -171,12 +171,10 @@ def api_local_download():
         return jsonify({"error": "No book ID provided"}), 400
 
     try:
-        file_data = backend.get_book_data(book_id)
+        file_data, file_name = backend.get_book_data(book_id)
         if file_data is None:
             # Book data not found or not available
             return jsonify({"error": "File not found"}), 404
-
-        file_data, file_name = file_data
         # Sanitize the file name
         file_name = re.sub(r'[\\/:*?"<>|]', '_', file_name.strip())[:255]
         # Prepare the file for sending to the client
diff --git a/backend.py b/backend.py
index bd89855..4818b3a 100644
--- a/backend.py
+++ b/backend.py
@@ -3,7 +3,7 @@
 import threading, time
 import subprocess
 from pathlib import Path
-from typing import Dict, List, Optional, Any
+from typing import Dict, List, Optional, Any, Tuple
 
 from logger import setup_logger
 from config import TMP_DIR, MAIN_LOOP_SLEEP_TIME, INGEST_DIR
@@ -75,14 +75,14 @@ def queue_status() -> Dict[str, Dict[str, Any]]:
         for status_type, books in status.items()
     }
 
-def get_book_data(book_id: str) -> Optional[bytes]:
-    """Get book data for a specific book.
+def get_book_data(book_id: str) -> Tuple[Optional[bytes], str]:
+    """Get book data for a specific book, including its title.
 
     Args:
         book_id: Book identifier
 
     Returns:
-        Optional[bytes]: Book data if available
+        Tuple[Optional[bytes], str]: Book data if available, and the book title
     """
     try:
         book_info = book_queue._book_data[book_id]
@@ -91,7 +91,7 @@ def get_book_data(book_id: str) -> Optional[bytes]:
             return f.read(), book_info.title
     except Exception as e:
         logger.error(f"Error getting book data: {e}")
-        return None
+        return None, ""
 
 def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]:
     """Convert BookInfo object to dictionary representation."""
@@ -110,12 +110,14 @@ def _process_book(book_path: str) -> bool:
         bool: True if book is valid
     """
     try:
+        logger.info(f"Verifying book health: {book_path}")
         script_path = Path(__file__).parent / "check_health.sh"
         result = subprocess.run(
             [str(script_path), book_path],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE
         )
+        logger.info(f"Health check result: {result.stdout.decode()}")
         return result.returncode == 0
     except Exception as e:
         logger.error(f"Error checking book health: {e}")
@@ -132,15 +134,10 @@ def _download_book(book_id: str) -> bool:
     """
     try:
         book_info = book_queue._book_data[book_id]
-        data = book_manager.download_book(book_info)
-
-        if not data:
-            raise Exception("No data received")
-
         book_path = TMP_DIR / f"{book_id}.{book_info.format}"
-        with open(book_path, "wb") as f:
-            f.write(data.getbuffer())
-
+        success = book_manager.download_book(book_info, book_path)
+        if not success:
+            raise Exception("Unknown error downloading book")
         return _process_book(str(book_path))
 
     except Exception as e:
diff --git a/book_manager.py b/book_manager.py
index 214f54a..3773f3c 100644
--- a/book_manager.py
+++ b/book_manager.py
@@ -1,11 +1,11 @@
 """Book download manager handling search and retrieval operations."""
 
-import time
+import time, json
+from pathlib import Path
 from urllib.parse import quote
 from typing import List, Optional, Dict
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag, NavigableString
 from io import BytesIO
-import json
 
 from logger import setup_logger
 from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY, AA_BASE_URL, USE_CF_BYPASS
@@ -43,20 +43,21 @@ def search_books(query: str) -> List[BookInfo]:
         raise Exception("No books found. Please try another query.")
 
     soup = BeautifulSoup(html, 'html.parser')
-    tbody = soup.find('table')
+    tbody: Tag | NavigableString | None = soup.find('table')
 
     if not tbody:
         logger.warning(f"No results table found for query: {query}")
         raise Exception("No books found. Please try another query.")
 
     books = []
-    for line_tr in tbody.find_all('tr'):
-        try:
-            book = _parse_search_result_row(line_tr)
-            if book:
-                books.append(book)
-        except Exception as e:
-            logger.error(f"Failed to parse search result row: {e}")
+    if isinstance(tbody, Tag):
+        for line_tr in tbody.find_all('tr'):
+            try:
+                book = _parse_search_result_row(line_tr)
+                if book:
+                    books.append(book)
+            except Exception as e:
+                logger.error(f"Failed to parse search result row: {e}")
 
     books.sort(
         key=lambda x: (
@@ -116,9 +117,17 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
     if not data:
         raise Exception(f"Failed to parse book info for ID: {book_id}")
 
-    preview = data.select_one(
+    preview: str = ""
+
+    node = data.select_one(
         'div:nth-of-type(1) > img'
-    )['src']
+    )
+    if node:
+        preview_value = node.get('src', "")
+        if isinstance(preview_value, list):
+            preview = preview_value[0]
+        else:
+            preview = preview_value
 
     # Find the start of book information
     divs = data.find_all('div')
@@ -164,17 +173,12 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
                 external_urls_z_lib.add(url['href'])
             except:
                 pass
-
-
-    slow_urls_no_waitlist = list(slow_urls_no_waitlist)
-    slow_urls_with_waitlist = list(slow_urls_with_waitlist)
-    external_urls_libgen = list(external_urls_libgen)
-    external_urls_z_lib = list(external_urls_z_lib)
 
     if USE_CF_BYPASS:
-        urls = slow_urls_no_waitlist + external_urls_libgen + slow_urls_with_waitlist + external_urls_z_lib
+        urls = list(slow_urls_no_waitlist) + list(external_urls_libgen) + list(slow_urls_with_waitlist) + list(external_urls_z_lib)
     else:
-        urls = external_urls_libgen + external_urls_z_lib + slow_urls_no_waitlist + slow_urls_with_waitlist
+        urls = list(external_urls_libgen) + list(external_urls_z_lib) + list(slow_urls_no_waitlist) + list(slow_urls_with_waitlist)
+
     for i in range(len(urls)):
         urls[i] = network.get_absolute_url(AA_BASE_URL, urls[i])
 
@@ -204,7 +208,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
 
 def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
     """Extract metadata from book info divs."""
-    info = {}
+    info: Dict[str, List[str]] = {}
 
     # Process the first set of metadata
     sub_data = metadata_divs[0].find_all('div')
@@ -239,7 +243,7 @@ def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
         and "filename" not in k.lower()
     }
 
-def download_book(book_info: BookInfo) -> Optional[BytesIO]:
+def download_book(book_info: BookInfo, book_path: Path) -> bool:
     """Download a book from available sources.
 
     Args:
         book_info: Book information
+        book_path: Destination path for the downloaded file
 
     Returns:
-        Optional[BytesIO]: Book content buffer if successful
+        bool: True if the download succeeded
     """
-
-
     if len(book_info.download_urls) == 0:
         book_info = get_book_info(book_info.id)
     download_links = book_info.download_urls
 
     # If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources.
-    if AA_DONATOR_KEY is not None:
+    if AA_DONATOR_KEY != "":
         download_links.insert(0,
             f"{AA_BASE_URL}/dyn/api/fast_download.json?md5={book_info.id}&key={AA_DONATOR_KEY}"
         )
@@ -265,33 +267,40 @@ def download_book(book_info: BookInfo) -> Optional[BytesIO]:
     for link in download_links:
         try:
             download_url = _get_download_url(link, book_info.title)
-            if download_url:
+            if download_url != "":
                 logger.info(f"Downloading {book_info.title} from {download_url}")
-                return network.download_url(download_url, book_info.size)
+                data = network.download_url(download_url, book_info.size or "")
+                if not data:
+                    raise Exception("No data received")
+
+                logger.info(f"Download finished. Writing to {book_path}")
+                with open(book_path, "wb") as f:
+                    f.write(data.getbuffer())
+                logger.info(f"Successfully wrote {book_info.title}")
+                return True
+
         except Exception as e:
             logger.error(f"Failed to download from {link}: {e}")
             continue
 
-    return None
+    return False
 
-def _get_download_url(link: str, title: str) -> Optional[str]:
+def _get_download_url(link: str, title: str) -> str:
     """Extract actual download URL from various source pages."""
     if link.startswith(f"{AA_BASE_URL}/dyn/api/fast_download.json"):
         page = network.html_get_page(link)
-        return json.loads(page).get("download_url")
+        return json.loads(page).get("download_url", "")
 
-    try:
-        html = network.html_get_page(link, retry=0, skip_403=True)
-    except:
+    html = network.html_get_page(link, retry=0, skip_403=True)
+    if html == "":
         html = network.html_get_page_cf(link)
 
-    if not html:
-        return None
+    if html == "":
+        return ""
 
     soup = BeautifulSoup(html, 'html.parser')
 
-    url = None
+    url = ""
     if link.startswith("https://z-lib.gs"):
         download_link = soup.find_all('a', href=True, class_="addDownloadedBook")
diff --git a/check_health.sh b/check_health.sh
index 9ca7c92..774e5a2 100644
--- a/check_health.sh
+++ b/check_health.sh
@@ -35,30 +35,11 @@ for file in "${files[@]}"; do
     fileextension="${filenamewithext##*.}"
 
     case "$fileextension" in
-        epub)
-            # Check if the EPUB file is a valid archive
-            7z t "$file" >/dev/null 2>&1;
-            exit_code=$?
-            if [ "$exit_code" -eq 0 ] || [ "$exit_code" -eq 1 ]; then
-                mv "$file" "$OUTPUTFOLDER/$filenamewithext"
-                good=$((good + 1))
-            else
-                ebook-convert "$file" /tmp/tmpepub.epub >/dev/null 2>&1
-                exit_code=$?
-                rm -f /tmp/tmpepub.epub
-                if [ "$exit_code" -eq 0 ]; then
-                    mv "$file" "$OUTPUTFOLDER/$filenamewithext"
-                    good=$((good + 1))
-                else
-                    rm "$file"
-                    bad=$((bad + 1))
-                fi
-            fi
-            ;;
-        mobi|azw3|fb2|djvu|cbz|cbr)
+        epub|mobi|azw3|fb2|djvu|cbz|cbr)
             # Attempt to convert the file to EPUB
             ebook-convert "$file" "$OUTPUTFOLDER/$filename.epub" >/dev/null 2>&1
-            if [ "$exit_code" -eq 0 ]; then
+            # if the file exists at $OUTPUTFOLDER/$filename.epub, the conversion succeeded
+            if [ -f "$OUTPUTFOLDER/$filename.epub" ]; then
                 good=$((good + 1))
             else
                 bad=$((bad + 1))
@@ -82,6 +63,6 @@ if [ "$bad" -gt 0 ]; then
     exit 2
 fi
 if [ "$manual" -gt 0 ]; then
-    exut 1
+    exit 1
 fi
 exit 0
diff --git a/config.py b/config.py
index 612bbc9..31e1a90 100644
--- a/config.py
+++ b/config.py
@@ -7,14 +7,11 @@
 
 # Directory settings
 BASE_DIR = Path(__file__).resolve().parent
-LOG_DIR = "/var/logs"
-LOG_DIR = Path(LOG_DIR)
+LOG_DIR = Path("/var/logs")
 
-TMP_DIR = os.getenv("TMP_DIR", "/tmp/cwa-book-downloader")
-TMP_DIR = Path(TMP_DIR)
+TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader"))
 
-INGEST_DIR = os.getenv("INGEST_DIR", "/cwa-book-ingest")
-INGEST_DIR = Path(INGEST_DIR)
+INGEST_DIR = Path(os.getenv("INGEST_DIR", "/cwa-book-ingest"))
 STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", 3600))
 
 # Create necessary directories
@@ -26,19 +23,16 @@
 MAX_RETRY = int(os.getenv("MAX_RETRY", 3))
 DEFAULT_SLEEP = int(os.getenv("DEFAULT_SLEEP", 5))
 CLOUDFLARE_PROXY = os.getenv("CLOUDFLARE_PROXY_URL", "http://localhost:8000")
-USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "false").lower()
-USE_CF_BYPASS = USE_CF_BYPASS.lower() in ["true", "yes", "1", "y"]
+USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "true").lower() in ["true", "yes", "1", "y"]
 
 # Anna's Archive settings
-AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", None)
+AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", "").strip()
 AA_BASE_URL = os.getenv("AA_BASE_URL", "https://annas-archive.org").strip("/")
 
 # File format settings
-SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr")
-SUPPORTED_FORMATS = SUPPORTED_FORMATS.split(",")
+SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr").split(",")
 
-BOOK_LANGUAGE = os.getenv("BOOK_LANGUAGE", "en")
-BOOK_LANGUAGE = BOOK_LANGUAGE.lower().split(',')
+BOOK_LANGUAGE = os.getenv("BOOK_LANGUAGE", "en").lower().split(',')
 BOOK_LANGUAGE = [l for l in BOOK_LANGUAGE if l in _SUPPORTED_BOOK_LANGUAGE]
 if len(BOOK_LANGUAGE) == 0:
     BOOK_LANGUAGE = ['en']
diff --git a/logger.py b/logger.py
index 432d6d8..8153f30 100644
--- a/logger.py
+++ b/logger.py
@@ -5,7 +5,7 @@
 from logging.handlers import RotatingFileHandler
 from config import FLASK_DEBUG
 
-def setup_logger(name: str, log_file: str = None) -> logging.Logger:
+def setup_logger(name: str, log_file: str = "") -> logging.Logger:
     """Set up and configure a logger instance.
 
     Args:
@@ -39,9 +39,9 @@ def setup_logger(name: str, log_file: str = None) -> logging.Logger:
         logger.addHandler(error_handler)
 
     # File handler if log file is specified
-    if log_file:
+    if log_file.strip() != "":
         file_handler = RotatingFileHandler(
-            log_file,
+            log_file.strip(),
             maxBytes=10485760,  # 10MB
             backupCount=5
         )
diff --git a/models.py b/models.py
index 340f919..e703914 100644
--- a/models.py
+++ b/models.py
@@ -27,7 +27,7 @@ class BookInfo:
     format: Optional[str] = None
     size: Optional[str] = None
     info: Optional[Dict[str, List[str]]] = None
-    download_urls: Optional[List[str]] = field(default_factory=list)
+    download_urls: List[str] = field(default_factory=list)
 
 class BookQueue:
     """Thread-safe book queue manager."""
@@ -62,11 +62,11 @@ def update_status(self, book_id: str, status: QueueStatus) -> None:
         with self._lock:
             self._update_status(book_id, status)
 
-    def get_status(self) -> Dict[str, Dict[str, BookInfo]]:
+    def get_status(self) -> Dict[QueueStatus, Dict[str, BookInfo]]:
         """Get current queue status."""
        self.refresh()
        with self._lock:
-            result = {status: {} for status in QueueStatus}
+            result: Dict[QueueStatus, Dict[str, BookInfo]] = {status: {} for status in QueueStatus}
             for book_id, status in self._status.items():
                 if book_id in self._book_data:
                     result[status][book_id] = self._book_data[book_id]
diff --git a/network.py b/network.py
index cf03ada..ebfecc8 100644
--- a/network.py
+++ b/network.py
@@ -25,7 +25,7 @@ def setup_urllib_opener():
 
 setup_urllib_opener()
 
-def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> Optional[str]:
+def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> str:
     """Fetch HTML content from a URL with retry mechanism.
 
     Args:
@@ -39,15 +39,7 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip
     try:
         logger.info(f"GET: {url}")
         response = requests.get(url)
-
-        if skip_404 and response.status_code == 404:
-            logger.warning(f"404 error for URL: {url}")
-            return None
-
-        if skip_403 and response.status_code == 403:
-            logger.warning(f"403 error for URL: {url}. Should retry using cloudflare bypass.")
-            return None
-
+        response.raise_for_status()
         time.sleep(1)
         return response.text
 
@@ -55,7 +47,16 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip
     except requests.exceptions.RequestException as e:
         if retry == 0:
             logger.error(f"Failed to fetch page: {url}, error: {e}")
-            return None
+            return ""
+
+        # e.response is None when the request failed before any reply arrived
+        if skip_404 and e.response is not None and e.response.status_code == 404:
+            logger.warning(f"404 error for URL: {url}")
+            return ""
+
+        if skip_403 and e.response is not None and e.response.status_code == 403:
+            logger.warning(f"403 error for URL: {url}. Should retry using cloudflare bypass.")
+            return ""
 
         sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1)
         logger.warning(
@@ -64,7 +65,7 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip
         time.sleep(sleep_time)
         return html_get_page(url, retry - 1)
 
-def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
+def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> str:
     """Fetch HTML content through Cloudflare proxy.
 
     Args:
@@ -88,7 +89,7 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
     except Exception as e:
         if retry == 0:
             logger.error(f"Failed to fetch page through CF: {url}, error: {e}")
-            return None
+            return ""
 
         sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1)
         logger.warning(
@@ -97,7 +98,7 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
     time.sleep(sleep_time)
     return html_get_page_cf(url, retry - 1)
 
-def download_url(link: str, size: str = None) -> Optional[BytesIO]:
+def download_url(link: str, size: str = "") -> Optional[BytesIO]:
     """Download content from URL into a BytesIO buffer.
 
     Args:
@@ -111,19 +112,23 @@ def download_url(link: str, size: str = None) -> Optional[BytesIO]:
         response = requests.get(link, stream=True)
         response.raise_for_status()
 
+        total_size: float = 0.0
         try:
-            total_size = size.strip().replace(" ", "").replace(",", ".").upper()  # we assume size is in MB
-            total_size = int(float(total_size[:-2].strip()) * 1024 * 1024)
+            # we assume size is given in MB
+            total_size = float(size.strip().replace(" ", "").replace(",", ".").upper()[:-2].strip()) * 1024 * 1024
         except:
-            total_size = int(response.headers.get('content-length', 0))
+            total_size = float(response.headers.get('content-length', 0))
 
         buffer = BytesIO()
-        for chunk in tqdm(response.iter_content(chunk_size=1024), total=total_size, unit='B', unit_scale=True, unit_divisor=1024):
+
+        # Initialize the progress bar with the estimated total size
+        pbar = tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading')
+        for chunk in response.iter_content(chunk_size=1000):
             buffer.write(chunk)
-        buffer.seek(0)
+            pbar.update(len(chunk))
+
+        pbar.close()
         return buffer
-
     except requests.exceptions.RequestException as e:
         logger.error(f"Failed to download from {link}: {e}")
         return None
@@ -135,8 +140,8 @@ def get_absolute_url(base_url: str, url: str) -> str:
         base_url: Base URL
         url: Relative URL
     """
-    if url == None or url.strip() == "":
-        return None
+    if url.strip() == "":
+        return ""
    if url.startswith("http"):
        return url
    parsed_url = urlparse(url)
diff --git a/requirements.txt b/requirements.txt
index 49a6329..aad7e15 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,6 @@ flask
 requests
 beautifulsoup4
 tqdm
+types-requests
+types-beautifulsoup4
+types-tqdm
\ No newline at end of file
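
--
A minimal usage sketch of the refactored API (not part of the patch); the
book ID and output path below are hypothetical placeholders:

    from pathlib import Path

    import backend
    import book_manager

    # get_book_data() now returns a (data, title) tuple instead of a bare
    # Optional[bytes]; data is None when the book is missing.
    file_data, file_name = backend.get_book_data("0123456789abcdef0123456789abcdef")  # hypothetical ID
    if file_data is None:
        print("File not found")

    # download_book() now writes the file to the given path and reports
    # success as a bool, instead of returning a BytesIO buffer.
    book_info = book_manager.get_book_info("0123456789abcdef0123456789abcdef")  # hypothetical ID
    ok = book_manager.download_book(book_info, Path("/tmp/example.epub"))  # hypothetical path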