Skip to content

Commit

Permalink
[FIX] Refactor code to fix typing (#23)
Browse files Browse the repository at this point in the history
This also fixes some bugs with the CF bypasser and the file checker
  • Loading branch information
calibrain authored Dec 23, 2024
1 parent 7597054 commit b8a7247
Show file tree
Hide file tree
Showing 10 changed files with 107 additions and 118 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ ENV CLOUDFLARE_PROXY_URL=http://localhost:8000
ENV INGEST_DIR=/cwa-book-ingest
ENV STATUS_TIMEOUT=3600
ENV PYTHONPATH=/app
ENV USE_CF_BYPASS=true
ENV AA_BASE_URL=https://annas-archive.org

# Default UID and GID (can be overridden at runtime)
ENV UID=1000
Expand Down
6 changes: 2 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

logger = setup_logger(__name__)
app = Flask(__name__)
app.wsgi_app = ProxyFix(app.wsgi_app)
app.wsgi_app = ProxyFix(app.wsgi_app) # type: ignore
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0 # Disable caching
app.config['APPLICATION_ROOT'] = '/'

Expand Down Expand Up @@ -171,12 +171,10 @@ def api_local_download():
return jsonify({"error": "No book ID provided"}), 400

try:
file_data = backend.get_book_data(book_id)
file_data, file_name = backend.get_book_data(book_id)
if file_data is None:
# Book data not found or not available
return jsonify({"error": "File not found"}), 404

file_data, file_name = file_data
        # Sanitize the file name
file_name = re.sub(r'[\\/:*?"<>|]', '_', file_name.strip())[:255]
# Prepare the file for sending to the client
Expand Down
23 changes: 10 additions & 13 deletions backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import threading, time
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Any
from typing import Dict, List, Optional, Any, Tuple

from logger import setup_logger
from config import TMP_DIR, MAIN_LOOP_SLEEP_TIME, INGEST_DIR
Expand Down Expand Up @@ -75,14 +75,14 @@ def queue_status() -> Dict[str, Dict[str, Any]]:
for status_type, books in status.items()
}

def get_book_data(book_id: str) -> Optional[bytes]:
"""Get book data for a specific book.
def get_book_data(book_id: str) -> Tuple[Optional[bytes], str] :
"""Get book data for a specific book, including its title.
Args:
book_id: Book identifier
Returns:
Optional[bytes]: Book data if available
Tuple[Optional[bytes], str]: Book data if available, and the book title
"""
try:
book_info = book_queue._book_data[book_id]
Expand All @@ -91,7 +91,7 @@ def get_book_data(book_id: str) -> Optional[bytes]:
return f.read(), book_info.title
except Exception as e:
logger.error(f"Error getting book data: {e}")
return None
return None, ""

def _book_info_to_dict(book: BookInfo) -> Dict[str, Any]:
"""Convert BookInfo object to dictionary representation."""
Expand All @@ -110,12 +110,14 @@ def _process_book(book_path: str) -> bool:
bool: True if book is valid
"""
try:
logger.info(f"Verifying book health: {book_path}")
script_path = Path(__file__).parent / "check_health.sh"
result = subprocess.run(
[str(script_path), book_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
logger.info(f"Health check result: {result.stdout.decode()}")
return result.returncode == 0
except Exception as e:
logger.error(f"Error checking book health: {e}")
Expand All @@ -132,15 +134,10 @@ def _download_book(book_id: str) -> bool:
"""
try:
book_info = book_queue._book_data[book_id]
data = book_manager.download_book(book_info)

if not data:
raise Exception("No data received")

book_path = TMP_DIR / f"{book_id}.{book_info.format}"
with open(book_path, "wb") as f:
f.write(data.getbuffer())

success = book_manager.download_book(book_info, book_path)
if not success:
            raise Exception("Unknown error downloading book")
return _process_book(str(book_path))

except Exception as e:
Expand Down
83 changes: 46 additions & 37 deletions book_manager.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Book download manager handling search and retrieval operations."""

import time
import time, json
from pathlib import Path
from urllib.parse import quote
from typing import List, Optional, Dict
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag, NavigableString
from io import BytesIO
import json

from logger import setup_logger
from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY, AA_BASE_URL, USE_CF_BYPASS
Expand Down Expand Up @@ -43,20 +43,21 @@ def search_books(query: str) -> List[BookInfo]:
raise Exception("No books found. Please try another query.")

soup = BeautifulSoup(html, 'html.parser')
tbody = soup.find('table')
tbody: Tag | NavigableString | None = soup.find('table')

if not tbody:
logger.warning(f"No results table found for query: {query}")
raise Exception("No books found. Please try another query.")

books = []
for line_tr in tbody.find_all('tr'):
try:
book = _parse_search_result_row(line_tr)
if book:
books.append(book)
except Exception as e:
logger.error(f"Failed to parse search result row: {e}")
if isinstance(tbody, Tag):
for line_tr in tbody.find_all('tr'):
try:
book = _parse_search_result_row(line_tr)
if book:
books.append(book)
except Exception as e:
logger.error(f"Failed to parse search result row: {e}")

books.sort(
key=lambda x: (
Expand Down Expand Up @@ -116,9 +117,17 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
if not data:
raise Exception(f"Failed to parse book info for ID: {book_id}")

preview = data.select_one(
preview: str = ""

node = data.select_one(
'div:nth-of-type(1) > img'
)['src']
)
if node:
preview_value = node.get('src', "")
if isinstance(preview_value, list):
preview = preview_value[0]
else:
preview = preview_value

# Find the start of book information
divs = data.find_all('div')
Expand Down Expand Up @@ -164,17 +173,12 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
external_urls_z_lib.add(url['href'])
except:
pass


slow_urls_no_waitlist = list(slow_urls_no_waitlist)
slow_urls_with_waitlist = list(slow_urls_with_waitlist)
external_urls_libgen = list(external_urls_libgen)
external_urls_z_lib = list(external_urls_z_lib)

if USE_CF_BYPASS:
urls = slow_urls_no_waitlist + external_urls_libgen + slow_urls_with_waitlist + external_urls_z_lib
urls = list(slow_urls_no_waitlist) + list(external_urls_libgen) + list(slow_urls_with_waitlist) + list(external_urls_z_lib)
else:
urls = external_urls_libgen + external_urls_z_lib + slow_urls_no_waitlist + slow_urls_with_waitlist
urls = list(external_urls_libgen) + list(external_urls_z_lib) + list(slow_urls_no_waitlist) + list(slow_urls_with_waitlist)

for i in range(len(urls)):
urls[i] = network.get_absolute_url(AA_BASE_URL, urls[i])

Expand Down Expand Up @@ -204,7 +208,7 @@ def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:

def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
"""Extract metadata from book info divs."""
info = {}
info : Dict[str, List[str]] = {}

# Process the first set of metadata
sub_data = metadata_divs[0].find_all('div')
Expand Down Expand Up @@ -239,7 +243,7 @@ def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
and "filename" not in k.lower()
}

def download_book(book_info: BookInfo) -> Optional[BytesIO]:
def download_book(book_info: BookInfo, book_path: Path) -> bool:
"""Download a book from available sources.
Args:
Expand All @@ -250,48 +254,53 @@ def download_book(book_info: BookInfo) -> Optional[BytesIO]:
        bool: True if the book was downloaded and written successfully
"""



if len(book_info.download_urls) == 0:
book_info = get_book_info(book_info.id)
download_links = book_info.download_urls

# If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources.
if AA_DONATOR_KEY is not None:
if AA_DONATOR_KEY != "":
download_links.insert(0,
f"{AA_BASE_URL}/dyn/api/fast_download.json?md5={book_info.id}&key={AA_DONATOR_KEY}"
)

for link in download_links:
try:
download_url = _get_download_url(link, book_info.title)
if download_url:
if download_url != "":
logger.info(f"Downloading {book_info.title} from {download_url}")
return network.download_url(download_url, book_info.size)
data = network.download_url(download_url, book_info.size or "")
if not data:
raise Exception("No data received")

logger.info(f"Download finished. Writing to {book_path}")
with open(book_path, "wb") as f:
f.write(data.getbuffer())
logger.info(f"Writing {book_info.title} successfully")
return True

except Exception as e:
logger.error(f"Failed to download from {link}: {e}")
continue

return None
return False

def _get_download_url(link: str, title: str) -> Optional[str]:
def _get_download_url(link: str, title: str) -> str:
"""Extract actual download URL from various source pages."""

if link.startswith(f"{AA_BASE_URL}/dyn/api/fast_download.json"):
page = network.html_get_page(link)
return json.loads(page).get("download_url")


try:
html = network.html_get_page(link, retry=0, skip_403=True)
except:
html = network.html_get_page(link, retry=0, skip_403=True)
if html == "":
html = network.html_get_page_cf(link)

if not html:
return None
if html == "":
return ""

soup = BeautifulSoup(html, 'html.parser')
url = None
url = ""

if link.startswith("https://z-lib.gs"):
download_link = soup.find_all('a', href=True, class_="addDownloadedBook")
Expand Down
27 changes: 4 additions & 23 deletions check_health.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,30 +35,11 @@ for file in "${files[@]}"; do
fileextension="${filenamewithext##*.}"

case "$fileextension" in
epub)
# Check if the EPUB file is a valid archive
7z t "$file" >/dev/null 2>&1;
exit_code=$?
if [ "$exit_code" -eq 0 ] || [ "$exit_code" -eq 1 ]; then
mv "$file" "$OUTPUTFOLDER/$filenamewithext"
good=$((good + 1))
else
ebook-convert "$file" /tmp/tmpepub.epub >/dev/null 2>&1
exit_code=$?
rm -f /tmp/tmpepub.epub
if [ "$exit_code" -eq 0 ]; then
mv "$file" "$OUTPUTFOLDER/$filenamewithext"
good=$((good + 1))
else
rm "$file"
bad=$((bad + 1))
fi
fi
;;
mobi|azw3|fb2|djvu|cbz|cbr)
epub|mobi|azw3|fb2|djvu|cbz|cbr)
# Attempt to convert the file to EPUB
ebook-convert "$file" "$OUTPUTFOLDER/$filename.epub" >/dev/null 2>&1
if [ "$exit_code" -eq 0 ]; then
# if file exists in $OUTPUTFOLDER/$filename.epub then it is a good file
if [ -f "$OUTPUTFOLDER/$filename.epub" ]; then
good=$((good + 1))
else
bad=$((bad + 1))
Expand All @@ -82,6 +63,6 @@ if [ "$bad" -gt 0 ]; then
exit 2
fi
if [ "$manual" -gt 0 ]; then
exut 1
exit 1
fi
exit 0
20 changes: 7 additions & 13 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,11 @@

# Directory settings
BASE_DIR = Path(__file__).resolve().parent
LOG_DIR = "/var/logs"
LOG_DIR = Path(LOG_DIR)
LOG_DIR = Path("/var/logs")

TMP_DIR = os.getenv("TMP_DIR", "/tmp/cwa-book-downloader")
TMP_DIR = Path(TMP_DIR)
TMP_DIR = Path(os.getenv("TMP_DIR", "/tmp/cwa-book-downloader"))

INGEST_DIR = os.getenv("INGEST_DIR", "/cwa-book-ingest")
INGEST_DIR = Path(INGEST_DIR)
INGEST_DIR = Path(os.getenv("INGEST_DIR", "/tmp/cwa-book-ingest"))
STATUS_TIMEOUT = int(os.getenv("STATUS_TIMEOUT", 3600))

# Create necessary directories
Expand All @@ -26,19 +23,16 @@
MAX_RETRY = int(os.getenv("MAX_RETRY", 3))
DEFAULT_SLEEP = int(os.getenv("DEFAULT_SLEEP", 5))
CLOUDFLARE_PROXY = os.getenv("CLOUDFLARE_PROXY_URL", "http://localhost:8000")
USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "false").lower()
USE_CF_BYPASS = USE_CF_BYPASS.lower() in ["true", "yes", "1", "y"]
USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "true").lower() in ["true", "yes", "1", "y"]

# Anna's Archive settings
AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", None)
AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", "").strip()
AA_BASE_URL = os.getenv("AA_BASE_URL", "https://annas-archive.org").strip("/")

# File format settings
SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr")
SUPPORTED_FORMATS = SUPPORTED_FORMATS.split(",")
SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr").split(",")

BOOK_LANGUAGE = os.getenv("BOOK_LANGUAGE", "en")
BOOK_LANGUAGE = BOOK_LANGUAGE.lower().split(',')
BOOK_LANGUAGE = os.getenv("BOOK_LANGUAGE", "en").lower().split(',')
BOOK_LANGUAGE = [l for l in BOOK_LANGUAGE if l in _SUPPORTED_BOOK_LANGUAGE]
if len(BOOK_LANGUAGE) == 0:
BOOK_LANGUAGE = ['en']
Expand Down
6 changes: 3 additions & 3 deletions logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from logging.handlers import RotatingFileHandler
from config import FLASK_DEBUG

def setup_logger(name: str, log_file: str = None) -> logging.Logger:
def setup_logger(name: str, log_file: str = "") -> logging.Logger:
"""Set up and configure a logger instance.
Args:
Expand Down Expand Up @@ -39,9 +39,9 @@ def setup_logger(name: str, log_file: str = None) -> logging.Logger:
logger.addHandler(error_handler)

# File handler if log file is specified
if log_file:
if log_file.strip() != "":
file_handler = RotatingFileHandler(
log_file,
log_file.strip(),
maxBytes=10485760, # 10MB
backupCount=5
)
Expand Down
Loading

0 comments on commit b8a7247

Please sign in to comment.