From b8a72475381c9245dfccac539b4e882f26d74e8b Mon Sep 17 00:00:00 2001
From: CaliBrain
Date: Sun, 22 Dec 2024 22:11:35 -0500
Subject: [PATCH] [FIX] Refactor code to fix typing (#23)

This also fixes some bugs with the CF bypasser and the file checker.
---
 Dockerfile       |  2 ++
 app.py           |  6 ++--
 backend.py       | 23 ++++++--------
 book_manager.py  | 83 +++++++++++++++++++++++++++---------------------
 check_health.sh  | 27 +++-------------
 config.py        | 20 ++++--------
 logger.py        |  6 ++--
 models.py        |  6 ++--
 network.py       | 49 +++++++++++++++------------
 requirements.txt |  3 ++
 10 files changed, 107 insertions(+), 118 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ef04af7..ce4dfa7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,6 +17,8 @@ ENV CLOUDFLARE_PROXY_URL=http://localhost:8000
 ENV INGEST_DIR=/cwa-book-ingest
 ENV STATUS_TIMEOUT=3600
 ENV PYTHONPATH=/app
+ENV USE_CF_BYPASS=true
+ENV AA_BASE_URL=https://annas-archive.org
 
 # Default UID and GID (can be overridden at runtime)
 ENV UID=1000
diff --git a/app.py b/app.py
index 739a9c2..09076cd 100644
--- a/app.py
+++ b/app.py
@@ -12,7 +12,7 @@
 logger = setup_logger(__name__)
 
 app = Flask(__name__)
-app.wsgi_app = ProxyFix(app.wsgi_app)
+app.wsgi_app = ProxyFix(app.wsgi_app)  # type: ignore
 app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0  # Disable caching
 app.config['APPLICATION_ROOT'] = '/'
 
@@ -171,12 +171,10 @@ def api_local_download():
         return jsonify({"error": "No book ID provided"}), 400
 
     try:
-        file_data = backend.get_book_data(book_id)
+        file_data, file_name = backend.get_book_data(book_id)
         if file_data is None:
             # Book data not found or not available
             return jsonify({"error": "File not found"}), 404
-
-        file_data, file_name = file_data
         # Sanitize the file name
         file_name = re.sub(r'[\\/:*?"<>|]', '_', file_name.strip())[:255]
         # Prepare the file for sending to the client
diff --git a/backend.py b/backend.py
index bd89855..4818b3a 100644
--- a/backend.py
+++ b/backend.py
@@ -3,7 +3,7 @@
 import threading, time
 import subprocess
 from pathlib import Path
-from typing import Dict, List, Optional, Any
+from typing import Dict, List, Optional, Any, Tuple
 
 from logger import setup_logger
 from config import TMP_DIR, MAIN_LOOP_SLEEP_TIME, INGEST_DIR
@@ -75,14 +75,14 @@ def queue_status() -> Dict[str, Dict[str, Any]]:
         for status_type, books in status.items()
     }
 
-def get_book_data(book_id: str) -> Optional[bytes]:
-    """Get book data for a specific book.
+def get_book_data(book_id: str) -> Tuple[Optional[bytes], str]:
+    """Get book data for a specific book, including its title.
 
     Args:
         book_id: Book identifier
 
     Returns:
-        Optional[bytes]: Book data if available
+        Tuple[Optional[bytes], str]: Book data if available, and the book title
     """
     try:
         book_info = book_queue._book_data[book_id]
@@ -91,7 +91,7 @@ def get_book_data(book_id: str) -> Optional[bytes]:
             return f.read(), book_info.title
     except Exception as e:
         logger.error(f"Error getting book data: {e}")
-        return None
+        return None, ""
 
 def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]:
     """Convert BookInfo object to dictionary representation."""
@@ -110,12 +110,14 @@ def _process_book(book_path: str) -> bool:
         bool: True if book is valid
     """
     try:
+        logger.info(f"Verifying book health: {book_path}")
         script_path = Path(__file__).parent / "check_health.sh"
         result = subprocess.run(
             [str(script_path), book_path],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE
         )
+        logger.info(f"Health check result: {result.stdout.decode()}")
         return result.returncode == 0
     except Exception as e:
         logger.error(f"Error checking book health: {e}")
@@ -132,15 +134,10 @@ def _download_book(book_id: str) -> bool:
     """
     try:
         book_info = book_queue._book_data[book_id]
-        data = book_manager.download_book(book_info)
-
-        if not data:
-            raise Exception("No data received")
-
         book_path = TMP_DIR / f"{book_id}.{book_info.format}"
-        with open(book_path, "wb") as f:
-            f.write(data.getbuffer())
-
+        success = book_manager.download_book(book_info, book_path)
+        if not success:
+            raise Exception("Unknown error downloading book")
         return _process_book(str(book_path))
 
     except Exception as e:
diff --git a/book_manager.py b/book_manager.py
index 214f54a..3773f3c 100644
--- a/book_manager.py
+++ b/book_manager.py
@@ -1,11 +1,11 @@
 """Book download manager handling search and retrieval operations."""
 
-import time
+import time, json
+from pathlib import Path
 from urllib.parse import quote
 from typing import List, Optional, Dict
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag, NavigableString
 from io import BytesIO
-import json
 
 from logger import setup_logger
 from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY, AA_BASE_URL, USE_CF_BYPASS
@@ -43,20 +43,21 @@ def search_books(query: str) -> List[BookInfo]:
         raise Exception("No books found. Please try another query.")
 
     soup = BeautifulSoup(html, 'html.parser')
-    tbody = soup.find('table')
+    tbody: Tag | NavigableString | None = soup.find('table')
 
     if not tbody:
         logger.warning(f"No results table found for query: {query}")
         raise Exception("No books found. Please try another query.")
 
     books = []
-    for line_tr in tbody.find_all('tr'):
-        try:
-            book = _parse_search_result_row(line_tr)
-            if book:
-                books.append(book)
-        except Exception as e:
-            logger.error(f"Failed to parse search result row: {e}")
+    if isinstance(tbody, Tag):
+        for line_tr in tbody.find_all('tr'):
+            try:
+                book = _parse_search_result_row(line_tr)
+                if book:
+                    books.append(book)
+            except Exception as e:
+                logger.error(f"Failed to parse search result row: {e}")
 
     books.sort(
         key=lambda x: (
@@ -116,9 +117,17 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
     if not data:
         raise Exception(f"Failed to parse book info for ID: {book_id}")
 
-    preview = data.select_one(
+    preview: str = ""
+
+    node = data.select_one(
         'div:nth-of-type(1) > img'
-    )['src']
+    )
+    if node:
+        preview_value = node.get('src', "")
+        if isinstance(preview_value, list):
+            preview = preview_value[0]
+        else:
+            preview = preview_value
 
     # Find the start of book information
     divs = data.find_all('div')
@@ -164,17 +173,12 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
                 external_urls_z_lib.add(url['href'])
             except:
                 pass
-
-
-    slow_urls_no_waitlist = list(slow_urls_no_waitlist)
-    slow_urls_with_waitlist = list(slow_urls_with_waitlist)
-    external_urls_libgen = list(external_urls_libgen)
-    external_urls_z_lib = list(external_urls_z_lib)
 
     if USE_CF_BYPASS:
-        urls = slow_urls_no_waitlist + external_urls_libgen + slow_urls_with_waitlist + external_urls_z_lib
+        urls = list(slow_urls_no_waitlist) + list(external_urls_libgen) + list(slow_urls_with_waitlist) + list(external_urls_z_lib)
     else:
-        urls = external_urls_libgen + external_urls_z_lib + slow_urls_no_waitlist + slow_urls_with_waitlist
+        urls = list(external_urls_libgen) + list(external_urls_z_lib) + list(slow_urls_no_waitlist) + list(slow_urls_with_waitlist)
+
     for i in range(len(urls)):
         urls[i] = network.get_absolute_url(AA_BASE_URL, urls[i])
 
@@ -204,7 +208,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
 
 def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
     """Extract metadata from book info divs."""
-    info = {}
+    info: Dict[str, List[str]] = {}
 
     # Process the first set of metadata
     sub_data = metadata_divs[0].find_all('div')
@@ -239,7 +243,7 @@ def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
         and "filename" not in k.lower()
     }
 
-def download_book(book_info: BookInfo) -> Optional[BytesIO]:
+def download_book(book_info: BookInfo, book_path: Path) -> bool:
     """Download a book from available sources.
 
     Args:
         book_info: Book information
+        book_path: Destination path for the downloaded file
 
     Returns:
-        Optional[BytesIO]: Book content buffer if successful
+        bool: True if the download succeeded
     """
-
-
     if len(book_info.download_urls) == 0:
         book_info = get_book_info(book_info.id)
     download_links = book_info.download_urls
 
     # If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources.
-    if AA_DONATOR_KEY is not None:
+    if AA_DONATOR_KEY != "":
         download_links.insert(0,
             f"{AA_BASE_URL}/dyn/api/fast_download.json?md5={book_info.id}&key={AA_DONATOR_KEY}"
         )
@@ -265,33 +267,40 @@ def download_book(book_info: BookInfo) -> Optional[BytesIO]:
     for link in download_links:
         try:
             download_url = _get_download_url(link, book_info.title)
-            if download_url:
+            if download_url != "":
                 logger.info(f"Downloading {book_info.title} from {download_url}")
-                return network.download_url(download_url, book_info.size)
+                data = network.download_url(download_url, book_info.size or "")
+                if not data:
+                    raise Exception("No data received")
+
+                logger.info(f"Download finished. Writing to {book_path}")
+                with open(book_path, "wb") as f:
+                    f.write(data.getbuffer())
+                logger.info(f"Successfully wrote {book_info.title}")
+                return True
+
         except Exception as e:
             logger.error(f"Failed to download from {link}: {e}")
             continue
 
-    return None
+    return False
 
-def _get_download_url(link: str, title: str) -> Optional[str]:
+def _get_download_url(link: str, title: str) -> str:
     """Extract actual download URL from various source pages."""
     if link.startswith(f"{AA_BASE_URL}/dyn/api/fast_download.json"):
         page = network.html_get_page(link)
-        return json.loads(page).get("download_url")
+        return json.loads(page).get("download_url", "")
 
-    try:
-        html = network.html_get_page(link, retry=0, skip_403=True)
-    except:
+    html = network.html_get_page(link, retry=0, skip_403=True)
+    if html == "":
         html = network.html_get_page_cf(link)
 
-    if not html:
-        return None
+    if html == "":
+        return ""
 
     soup = BeautifulSoup(html, 'html.parser')
 
-    url = None
+    url = ""
     if link.startswith("https://z-lib.gs"):
         download_link = soup.find_all('a', href=True, class_="addDownloadedBook")
diff --git a/check_health.sh b/check_health.sh
index 9ca7c92..774e5a2 100644
--- a/check_health.sh
+++ b/check_health.sh
@@ -35,30 +35,11 @@ for file in "${files[@]}"; do
     fileextension="${filenamewithext##*.}"
 
     case "$fileextension" in
-        epub)
-            # Check if the EPUB file is a valid archive
-            7z t "$file" >/dev/null 2>&1;
-            exit_code=$?
-            if [ "$exit_code" -eq 0 ] || [ "$exit_code" -eq 1 ]; then
-                mv "$file" "$OUTPUTFOLDER/$filenamewithext"
-                good=$((good + 1))
-            else
-                ebook-convert "$file" /tmp/tmpepub.epub >/dev/null 2>&1
-                exit_code=$?
-                rm -f /tmp/tmpepub.epub
-                if [ "$exit_code" -eq 0 ]; then
-                    mv "$file" "$OUTPUTFOLDER/$filenamewithext"
-                    good=$((good + 1))
-                else
-                    rm "$file"
-                    bad=$((bad + 1))
-                fi
-            fi
-            ;;
-        mobi|azw3|fb2|djvu|cbz|cbr)
+        epub|mobi|azw3|fb2|djvu|cbz|cbr)
             # Attempt to convert the file to EPUB
             ebook-convert "$file" "$OUTPUTFOLDER/$filename.epub" >/dev/null 2>&1
-            if [ "$exit_code" -eq 0 ]; then
+            # if the file exists at $OUTPUTFOLDER/$filename.epub, the conversion succeeded
+            if [ -f "$OUTPUTFOLDER/$filename.epub" ]; then
                 good=$((good + 1))
             else
                 bad=$((bad + 1))
@@ -82,6 +63,6 @@ if [ "$bad" -gt 0 ]; then
     exit 2
 fi
 if [ "$manual" -gt 0 ]; then
-    exut 1
+    exit 1
 fi
 exit 0
diff --git a/config.py b/config.py
index 612bbc9..31e1a90 100644
--- a/config.py
+++ b/config.py
@@ -7,14 +7,11 @@
 
 # Directory settings
 BASE_DIR = Path(__file__).resolve().parent
-LOG_DIR = "/var/logs"
-LOG_DIR = Path(LOG_DIR)
+LOG_DIR = Path("/var/logs")
 
-TMP_DIR = os.getenv("TMP_DIR", "/tmp/cwa-book-downloader")
-TMP_DIR = Path(TMP_DIR)
+TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader"))
 
-INGEST_DIR = os.getenv("INGEST_DIR", "/cwa-book-ingest")
-INGEST_DIR = Path(INGEST_DIR)
+INGEST_DIR = Path(os.getenv("INGEST_DIR", "/cwa-book-ingest"))
 STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", 3600))
 
 # Create necessary directories
@@ -26,19 +23,16 @@
 MAX_RETRY = int(os.getenv("MAX_RETRY", 3))
 DEFAULT_SLEEP = int(os.getenv("DEFAULT_SLEEP", 5))
 CLOUDFLARE_PROXY = os.getenv("CLOUDFLARE_PROXY_URL", "http://localhost:8000")
-USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "false").lower()
-USE_CF_BYPASS = USE_CF_BYPASS.lower() in ["true", "yes", "1", "y"]
+USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "true").lower() in ["true", "yes", "1", "y"]
 
 # Anna's Archive settings
-AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", None)
+AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", "").strip()
 AA_BASE_URL = os.getenv("AA_BASE_URL", "https://annas-archive.org").strip("/")
 
 # File format settings
-SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr")
-SUPPORTED_FORMATS = SUPPORTED_FORMATS.split(",")
+SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr").split(",")
 
-BOOK_LANGUAGE = os.getenv("BOOK_LANGUAGE", "en")
-BOOK_LANGUAGE = BOOK_LANGUAGE.lower().split(',')
+BOOK_LANGUAGE = os.getenv("BOOK_LANGUAGE", "en").lower().split(',')
 BOOK_LANGUAGE = [l for l in BOOK_LANGUAGE if l in _SUPPORTED_BOOK_LANGUAGE]
 if len(BOOK_LANGUAGE) == 0:
     BOOK_LANGUAGE = ['en']
diff --git a/logger.py b/logger.py
index 432d6d8..8153f30 100644
--- a/logger.py
+++ b/logger.py
@@ -5,7 +5,7 @@
 from logging.handlers import RotatingFileHandler
 from config import FLASK_DEBUG
 
-def setup_logger(name: str, log_file: str = None) -> logging.Logger:
+def setup_logger(name: str, log_file: str = "") -> logging.Logger:
     """Set up and configure a logger instance.
 
     Args:
@@ -39,9 +39,9 @@ def setup_logger(name: str, log_file: str = None) -> logging.Logger:
         logger.addHandler(error_handler)
 
     # File handler if log file is specified
-    if log_file:
+    if log_file.strip() != "":
         file_handler = RotatingFileHandler(
-            log_file,
+            log_file.strip(),
             maxBytes=10485760,  # 10MB
             backupCount=5
         )
diff --git a/models.py b/models.py
index 340f919..e703914 100644
--- a/models.py
+++ b/models.py
@@ -27,7 +27,7 @@ class BookInfo:
     format: Optional[str] = None
     size: Optional[str] = None
     info: Optional[Dict[str, List[str]]] = None
-    download_urls: Optional[List[str]] = field(default_factory=list)
+    download_urls: List[str] = field(default_factory=list)
 
 class BookQueue:
     """Thread-safe book queue manager."""
@@ -62,11 +62,11 @@ def update_status(self, book_id: str, status: QueueStatus) -> None:
         with self._lock:
             self._update_status(book_id, status)
 
-    def get_status(self) -> Dict[str, Dict[str, BookInfo]]:
+    def get_status(self) -> Dict[QueueStatus, Dict[str, BookInfo]]:
         """Get current queue status."""
        self.refresh()
        with self._lock:
-            result = {status: {} for status in QueueStatus}
+            result: Dict[QueueStatus, Dict[str, BookInfo]] = {status: {} for status in QueueStatus}
             for book_id, status in self._status.items():
                 if book_id in self._book_data:
                     result[status][book_id] = self._book_data[book_id]
diff --git a/network.py b/network.py
index cf03ada..ebfecc8 100644
--- a/network.py
+++ b/network.py
@@ -25,7 +25,7 @@ def setup_urllib_opener():
 
 setup_urllib_opener()
 
-def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> Optional[str]:
+def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> str:
     """Fetch HTML content from a URL with retry mechanism.
 
     Args:
@@ -39,15 +39,7 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip
     try:
         logger.info(f"GET: {url}")
         response = requests.get(url)
-
-        if skip_404 and response.status_code == 404:
-            logger.warning(f"404 error for URL: {url}")
-            return None
-
-        if skip_403 and response.status_code == 403:
-            logger.warning(f"403 error for URL: {url}. Should retry using cloudflare bypass.")
-            return None
-
+        response.raise_for_status()
         time.sleep(1)
         return response.text
 
@@ -55,7 +47,16 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip
     except requests.exceptions.RequestException as e:
         if retry == 0:
             logger.error(f"Failed to fetch page: {url}, error: {e}")
-            return None
+            return ""
+
+        # e.response is None when the request failed before any reply arrived
+        if skip_404 and e.response is not None and e.response.status_code == 404:
+            logger.warning(f"404 error for URL: {url}")
+            return ""
+
+        if skip_403 and e.response is not None and e.response.status_code == 403:
+            logger.warning(f"403 error for URL: {url}. Should retry using cloudflare bypass.")
+            return ""
 
         sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1)
         logger.warning(
@@ -64,7 +65,7 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip
         time.sleep(sleep_time)
         return html_get_page(url, retry - 1)
 
-def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
+def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> str:
     """Fetch HTML content through Cloudflare proxy.
 
     Args:
@@ -88,7 +89,7 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
     except Exception as e:
         if retry == 0:
             logger.error(f"Failed to fetch page through CF: {url}, error: {e}")
-            return None
+            return ""
 
         sleep_time = DEFAULT_SLEEP * (MAX_RETRY - retry + 1)
         logger.warning(
@@ -97,7 +98,7 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
     time.sleep(sleep_time)
     return html_get_page_cf(url, retry - 1)
 
-def download_url(link: str, size: str = None) -> Optional[BytesIO]:
+def download_url(link: str, size: str = "") -> Optional[BytesIO]:
     """Download content from URL into a BytesIO buffer.
 
     Args:
@@ -111,19 +112,23 @@ def download_url(link: str, size: str = None) -> Optional[BytesIO]:
         response = requests.get(link, stream=True)
         response.raise_for_status()
 
+        total_size: float = 0.0
         try:
-            total_size = size.strip().replace(" ", "").replace(",", ".").upper()  # we assume size is in MB
-            total_size = int(float(total_size[:-2].strip()) * 1024 * 1024)
+            # we assume size is given in MB
+            total_size = float(size.strip().replace(" ", "").replace(",", ".").upper()[:-2].strip()) * 1024 * 1024
         except:
-            total_size = int(response.headers.get('content-length', 0))
+            total_size = float(response.headers.get('content-length', 0))
 
         buffer = BytesIO()
-        for chunk in tqdm(response.iter_content(chunk_size=1024), total=total_size, unit='B', unit_scale=True, unit_divisor=1024):
+
+        # Initialize the progress bar with the estimated total size
+        pbar = tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading')
+        for chunk in response.iter_content(chunk_size=1000):
             buffer.write(chunk)
-        buffer.seek(0)
+            pbar.update(len(chunk))
+
+        pbar.close()
         return buffer
-
     except requests.exceptions.RequestException as e:
         logger.error(f"Failed to download from {link}: {e}")
         return None
@@ -135,8 +140,8 @@ def get_absolute_url(base_url: str, url: str) -> str:
         base_url: Base URL
         url: Relative URL
     """
-    if url == None or url.strip() == "":
-        return None
+    if url.strip() == "":
+        return ""
    if url.startswith("http"):
        return url
    parsed_url = urlparse(url)
diff --git a/requirements.txt b/requirements.txt
index 49a6329..aad7e15 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,6 @@ flask
 requests
 beautifulsoup4
 tqdm
+types-requests
+types-beautifulsoup4
+types-tqdm
\ No newline at end of file
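
--
A minimal usage sketch of the refactored API (not part of the patch); the
book ID and output path below are hypothetical placeholders:

    from pathlib import Path

    import backend
    import book_manager

    # get_book_data() now returns a (data, title) tuple instead of a bare
    # Optional[bytes]; data is None when the book is missing.
    file_data, file_name = backend.get_book_data("0123456789abcdef0123456789abcdef")  # hypothetical ID
    if file_data is None:
        print("File not found")

    # download_book() now writes the file to the given path and reports
    # success as a bool, instead of returning a BytesIO buffer.
    book_info = book_manager.get_book_info("0123456789abcdef0123456789abcdef")  # hypothetical ID
    ok = book_manager.download_book(book_info, Path("/tmp/example.epub"))  # hypothetical path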