Skip to content

Commit

Permalink
[FIX] Refactor code to fix typing (#23)
Browse files Browse the repository at this point in the history
This also fixes some bugs with the CF bypasser and the file checker
  • Loading branch information
calibrain authored Dec 23, 2024
1 parent 7597054 commit b8a7247
Show file tree
Hide file tree
Showing 10 changed files with 107 additions and 118 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ ENV CLOUDFLARE_PROXY_URL=http://localhost:8000
ENV INGEST_DIR=/cwa-book-ingest
ENV STATUS_TIMEOUT=3600
ENV PYTHONPATH=/app
ENV USE_CF_BYPASS=true
ENV AA_BASE_URL=https://annas-archive.org

# Default UID and GID (can be overridden at runtime)
ENV UID=1000
Expand Down
6 changes: 2 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

logger = setup_logger(__name__)
app = Flask(__name__)
app.wsgi_app = ProxyFix(app.wsgi_app)
app.wsgi_app = ProxyFix(app.wsgi_app) # type: ignore
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0 # Disable caching
app.config['APPLICATION_ROOT'] = '/'

Expand Down Expand Up @@ -171,12 +171,10 @@ def api_local_download():
return jsonify({"error": "No book ID provided"}), 400

try:
file_data = backend.get_book_data(book_id)
file_data, file_name = backend.get_book_data(book_id)
if file_data is None:
# Book data not found or not available
return jsonify({"error": "File not found"}), 404

file_data, file_name = file_data
        # Sanitize the file name
file_name = re.sub(r'[\\/:*?"<>|]', '_', file_name.strip())[:255]
# Prepare the file for sending to the client
Expand Down
23 changes: 10 additions & 13 deletions backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import threading, time
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Any
from typing import Dict, List, Optional, Any, Tuple

from logger import setup_logger
from config import TMP_DIR, MAIN_LOOP_SLEEP_TIME, INGEST_DIR
Expand Down Expand Up @@ -75,14 +75,14 @@ def queue_status() -> Dict[str, Dict[str, Any]]:
for status_type, books in status.items()
}

def get_book_data(book_id: str) -> Optional[bytes]:
"""Get book data for a specific book.
def get_book_data(book_id: str) -> Tuple[Optional[bytes], str] :
"""Get book data for a specific book, including its title.
Args:
book_id: Book identifier
Returns:
Optional[bytes]: Book data if available
Tuple[Optional[bytes], str]: Book data if available, and the book title
"""
try:
book_info = book_queue._book_data[book_id]
Expand All @@ -91,7 +91,7 @@ def get_book_data(book_id: str) -> Optional[bytes]:
return f.read(), book_info.title
except Exception as e:
logger.error(f"Error getting book data: {e}")
return None
return None, ""

def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]:
"""Convert BookInfo object to dictionary representation."""
Expand All @@ -110,12 +110,14 @@ def _process_book(book_path: str) -> bool:
bool: True if book is valid
"""
try:
logger.info(f"Verifying book health: {book_path}")
script_path = Path(__file__).parent / "check_health.sh"
result = subprocess.run(
[str(script_path), book_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
logger.info(f"Health check result: {result.stdout.decode()}")
return result.returncode == 0
except Exception as e:
logger.error(f"Error checking book health: {e}")
Expand All @@ -132,15 +134,10 @@ def _download_book(book_id: str) -> bool:
"""
try:
book_info = book_queue._book_data[book_id]
data = book_manager.download_book(book_info)

if not data:
raise Exception("No data received")

book_path = TMP_DIR / f"{book_id}.{book_info.format}"
with open(book_path, "wb") as f:
f.write(data.getbuffer())

success = book_manager.download_book(book_info, book_path)
if not success:
            raise Exception("Unknown error downloading book")
return _process_book(str(book_path))

except Exception as e:
Expand Down
83 changes: 46 additions & 37 deletions book_manager.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Book download manager handling search and retrieval operations."""

import time
import time, json
from pathlib import Path
from urllib.parse import quote
from typing import List, Optional, Dict
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag, NavigableString
from io import BytesIO
import json

from logger import setup_logger
from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY, AA_BASE_URL, USE_CF_BYPASS
Expand Down Expand Up @@ -43,20 +43,21 @@ def search_books(query: str) -> List[BookInfo]:
raise Exception("No books found. Please try another query.")

soup = BeautifulSoup(html, 'html.parser')
tbody = soup.find('table')
tbody: Tag | NavigableString | None = soup.find('table')

if not tbody:
logger.warning(f"No results table found for query: {query}")
raise Exception("No books found. Please try another query.")

books = []
for line_tr in tbody.find_all('tr'):
try:
book = _parse_search_result_row(line_tr)
if book:
books.append(book)
except Exception as e:
logger.error(f"Failed to parse search result row: {e}")
if isinstance(tbody, Tag):
for line_tr in tbody.find_all('tr'):
try:
book = _parse_search_result_row(line_tr)
if book:
books.append(book)
except Exception as e:
logger.error(f"Failed to parse search result row: {e}")

books.sort(
key=lambda x: (
Expand Down Expand Up @@ -116,9 +117,17 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
if not data:
raise Exception(f"Failed to parse book info for ID: {book_id}")

preview = data.select_one(
preview: str = ""

node = data.select_one(
'div:nth-of-type(1) > img'
)['src']
)
if node:
preview_value = node.get('src', "")
if isinstance(preview_value, list):
preview = preview_value[0]
else:
preview = preview_value

# Find the start of book information
divs = data.find_all('div')
Expand Down Expand Up @@ -164,17 +173,12 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
external_urls_z_lib.add(url['href'])
except:
pass


slow_urls_no_waitlist = list(slow_urls_no_waitlist)
slow_urls_with_waitlist = list(slow_urls_with_waitlist)
external_urls_libgen = list(external_urls_libgen)
external_urls_z_lib = list(external_urls_z_lib)

if USE_CF_BYPASS:
urls = slow_urls_no_waitlist + external_urls_libgen + slow_urls_with_waitlist + external_urls_z_lib
urls = list(slow_urls_no_waitlist) + list(external_urls_libgen) + list(slow_urls_with_waitlist) + list(external_urls_z_lib)
else:
urls = external_urls_libgen + external_urls_z_lib + slow_urls_no_waitlist + slow_urls_with_waitlist
urls = list(external_urls_libgen) + list(external_urls_z_lib) + list(slow_urls_no_waitlist) + list(slow_urls_with_waitlist)

for i in range(len(urls)):
urls[i] = network.get_absolute_url(AA_BASE_URL, urls[i])

Expand Down Expand Up @@ -204,7 +208,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:

def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
"""Extract metadata from book info divs."""
info = {}
info : Dict[str, List[str]] = {}

# Process the first set of metadata
sub_data = metadata_divs[0].find_all('div')
Expand Down Expand Up @@ -239,7 +243,7 @@ def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
and "filename" not in k.lower()
}

def download_book(book_info: BookInfo) -> Optional[BytesIO]:
def download_book(book_info: BookInfo, book_path: Path) -> bool:
"""Download a book from available sources.
Args:
Expand All @@ -250,48 +254,53 @@ def download_book(book_info: BookInfo) -> Optional[BytesIO]:
        bool: True if the book was downloaded and written successfully
"""



if len(book_info.download_urls) == 0:
book_info = get_book_info(book_info.id)
download_links = book_info.download_urls

# If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources.
if AA_DONATOR_KEY is not None:
if AA_DONATOR_KEY != "":
download_links.insert(0,
f"{AA_BASE_URL}/dyn/api/fast_download.json?md5={book_info.id}&key={AA_DONATOR_KEY}"
)

for link in download_links:
try:
download_url = _get_download_url(link, book_info.title)
if download_url:
if download_url != "":
logger.info(f"Downloading {book_info.title} from {download_url}")
return network.download_url(download_url, book_info.size)
data = network.download_url(download_url, book_info.size or "")
if not data:
raise Exception("No data received")

logger.info(f"Download finished. Writing to {book_path}")
with open(book_path, "wb") as f:
f.write(data.getbuffer())
logger.info(f"Writing {book_info.title} successfully")
return True

except Exception as e:
logger.error(f"Failed to download from {link}: {e}")
continue

return None
return False

def _get_download_url(link: str, title: str) -> Optional[str]:
def _get_download_url(link: str, title: str) -> str:
"""Extract actual download URL from various source pages."""

if link.startswith(f"{AA_BASE_URL}/dyn/api/fast_download.json"):
page = network.html_get_page(link)
return json.loads(page).get("download_url")


try:
html = network.html_get_page(link, retry=0, skip_403=True)
except:
html = network.html_get_page(link, retry=0, skip_403=True)
if html == "":
html = network.html_get_page_cf(link)

if not html:
return None
if html == "":
return ""

soup = BeautifulSoup(html, 'html.parser')
url = None
url = ""

if link.startswith("https://z-lib.gs"):
download_link = soup.find_all('a', href=True, class_="addDownloadedBook")
Expand Down
27 changes: 4 additions & 23 deletions check_health.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,30 +35,11 @@ for file in "${files[@]}"; do
fileextension="${filenamewithext##*.}"

case "$fileextension" in
epub)
# Check if the EPUB file is a valid archive
7z t "$file" >/dev/null 2>&1;
exit_code=$?
if [ "$exit_code" -eq 0 ] || [ "$exit_code" -eq 1 ]; then
mv "$file" "$OUTPUTFOLDER/$filenamewithext"
good=$((good + 1))
else
ebook-convert "$file" /tmp/tmpepub.epub >/dev/null 2>&1
exit_code=$?
rm -f /tmp/tmpepub.epub
if [ "$exit_code" -eq 0 ]; then
mv "$file" "$OUTPUTFOLDER/$filenamewithext"
good=$((good + 1))
else
rm "$file"
bad=$((bad + 1))
fi
fi
;;
mobi|azw3|fb2|djvu|cbz|cbr)
epub|mobi|azw3|fb2|djvu|cbz|cbr)
# Attempt to convert the file to EPUB
ebook-convert "$file" "$OUTPUTFOLDER/$filename.epub" >/dev/null 2>&1
if [ "$exit_code" -eq 0 ]; then
# if file exists in $OUTPUTFOLDER/$filename.epub then it is a good file
if [ -f "$OUTPUTFOLDER/$filename.epub" ]; then
good=$((good + 1))
else
bad=$((bad + 1))
Expand All @@ -82,6 +63,6 @@ if [ "$bad" -gt 0 ]; then
exit 2
fi
if [ "$manual" -gt 0 ]; then
exut 1
exit 1
fi
exit 0
20 changes: 7 additions & 13 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,11 @@

# Directory settings
BASE_DIR = Path(__file__).resolve().parent
LOG_DIR = "/var/logs"
LOG_DIR = Path(LOG_DIR)
LOG_DIR = Path("/var/logs")

TMP_DIR = os.getenv("TMP_DIR", "/tmp/cwa-book-downloader")
TMP_DIR = Path(TMP_DIR)
TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader"))

INGEST_DIR = os.getenv("INGEST_DIR", "/cwa-book-ingest")
INGEST_DIR = Path(INGEST_DIR)
INGEST_DIR = Path(os.getenv("INGEST_DIR", "/tmp/cwa-book-ingest"))
STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", 3600))

# Create necessary directories
Expand All @@ -26,19 +23,16 @@
MAX_RETRY = int(os.getenv("MAX_RETRY", 3))
DEFAULT_SLEEP = int(os.getenv("DEFAULT_SLEEP", 5))
CLOUDFLARE_PROXY = os.getenv("CLOUDFLARE_PROXY_URL", "http://localhost:8000")
USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "false").lower()
USE_CF_BYPASS = USE_CF_BYPASS.lower() in ["true", "yes", "1", "y"]
USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "true").lower() in ["true", "yes", "1", "y"]

# Anna's Archive settings
AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", None)
AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", "").strip()
AA_BASE_URL = os.getenv("AA_BASE_URL", "https://annas-archive.org").strip("/")

# File format settings
SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr")
SUPPORTED_FORMATS = SUPPORTED_FORMATS.split(",")
SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr").split(",")

BOOK_LANGUAGE = os.getenv("BOOK_LANGUAGE", "en")
BOOK_LANGUAGE = BOOK_LANGUAGE.lower().split(',')
BOOK_LANGUAGE = os.getenv("BOOK_LANGUAGE", "en").lower().split(',')
BOOK_LANGUAGE = [l for l in BOOK_LANGUAGE if l in _SUPPORTED_BOOK_LANGUAGE]
if len(BOOK_LANGUAGE) == 0:
BOOK_LANGUAGE = ['en']
Expand Down
6 changes: 3 additions & 3 deletions logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from logging.handlers import RotatingFileHandler
from config import FLASK_DEBUG

def setup_logger(name: str, log_file: str = None) -> logging.Logger:
def setup_logger(name: str, log_file: str = "") -> logging.Logger:
"""Set up and configure a logger instance.
Args:
Expand Down Expand Up @@ -39,9 +39,9 @@ def setup_logger(name: str, log_file: str = None) -> logging.Logger:
logger.addHandler(error_handler)

# File handler if log file is specified
if log_file:
if log_file.strip() != "":
file_handler = RotatingFileHandler(
log_file,
log_file.strip(),
maxBytes=10485760, # 10MB
backupCount=5
)
Expand Down
Loading

0 comments on commit b8a7247

Please sign in to comment.