Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #20 : Network error when using alternative download streams #22

Merged
merged 1 commit into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def _download_book(book_id: str) -> bool:
"""
try:
book_info = book_queue._book_data[book_id]
data = book_manager.download_book(book_id, book_info.title)
data = book_manager.download_book(book_info)

if not data:
raise Exception("No data received")
Expand Down
126 changes: 79 additions & 47 deletions book_manager.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""Book download manager handling search and retrieval operations."""

import time
from urllib.parse import urlparse, quote
from urllib.parse import quote
from typing import List, Optional, Dict
from bs4 import BeautifulSoup
from io import BytesIO
import json

from logger import setup_logger
from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY
from config import SUPPORTED_FORMATS, BOOK_LANGUAGE, AA_DONATOR_KEY, AA_BASE_URL, USE_CF_BYPASS
from models import BookInfo
import network

Expand All @@ -28,7 +28,8 @@ def search_books(query: str) -> List[BookInfo]:
"""
query_html = quote(query)
url = (
f"https://annas-archive.org/search?index=&page=1&display=table"
f"{AA_BASE_URL}"
f"/search?index=&page=1&display=table"
f"&acc=aa_download&acc=external_download&sort="
f"&ext={'&ext='.join(SUPPORTED_FORMATS)}&lang={'&lang='.join(BOOK_LANGUAGE)}&q={query_html}"
)
Expand Down Expand Up @@ -98,22 +99,23 @@ def get_book_info(book_id: str) -> BookInfo:
Returns:
BookInfo: Detailed book information
"""
url = f"https://annas-archive.org/md5/{book_id}"
url = f"{AA_BASE_URL}/md5/{book_id}"
html = network.html_get_page(url)

if not html:
raise Exception(f"Failed to fetch book info for ID: {book_id}")

soup = BeautifulSoup(html, 'html.parser')

return _parse_book_info_page(soup, book_id)

def _parse_book_info_page(soup: BeautifulSoup, book_id: str) -> BookInfo:
"""Parse the book info page HTML into a BookInfo object."""
data = soup.select_one('body > main > div:nth-of-type(1)')

if not data:
raise Exception(f"Failed to parse book info for ID: {book_id}")

return _parse_book_info_page(data, book_id)

def _parse_book_info_page(data, book_id: str) -> BookInfo:
"""Parse the book info page HTML into a BookInfo object."""

preview = data.select_one(
'div:nth-of-type(1) > img'
)['src']
Expand All @@ -138,6 +140,44 @@ def _parse_book_info_page(data, book_id: str) -> BookInfo:
None
)

every_url = soup.find_all('a')
slow_urls_no_waitlist = set()
slow_urls_with_waitlist = set()
external_urls_libgen = set()
external_urls_z_lib = set()


for url in every_url:
try:
if url.parent.text.strip().lower().startswith("option #"):
if url.text.strip().lower().startswith("slow partner server"):
if url.next is not None and url.next.next is not None and "waitlist" in url.next.next.strip().lower():
internal_text = url.next.next.strip().lower()
if "no waitlist" in internal_text:
slow_urls_no_waitlist.add(url['href'])
else:
slow_urls_with_waitlist.add(url['href'])
elif url.next is not None and url.next.next is not None and "click “GET” at the top" in url.next.next.text.strip():
external_urls_libgen.add(url['href'])
elif url.text.strip().lower().startswith("z-lib"):
if ".onion/" not in url['href']:
external_urls_z_lib.add(url['href'])
except:
pass


slow_urls_no_waitlist = list(slow_urls_no_waitlist)
slow_urls_with_waitlist = list(slow_urls_with_waitlist)
external_urls_libgen = list(external_urls_libgen)
external_urls_z_lib = list(external_urls_z_lib)

if USE_CF_BYPASS:
urls = slow_urls_no_waitlist + external_urls_libgen + slow_urls_with_waitlist + external_urls_z_lib
else:
urls = external_urls_libgen + external_urls_z_lib + slow_urls_no_waitlist + slow_urls_with_waitlist
for i in range(len(urls)):
urls[i] = network.get_absolute_url(AA_BASE_URL, urls[i])

# Extract basic information
book_info = BookInfo(
id=book_id,
Expand All @@ -146,7 +186,8 @@ def _parse_book_info_page(data, book_id: str) -> BookInfo:
publisher=divs[start_div_id + 1].next,
author=divs[start_div_id + 2].next,
format=format,
size=size
size=size,
download_urls=urls
)

# Extract additional metadata
Expand Down Expand Up @@ -198,7 +239,7 @@ def _extract_book_metadata(metadata_divs) -> Dict[str, List[str]]:
and "filename" not in k.lower()
}

def download_book(book_id: str, title: str) -> Optional[BytesIO]:
def download_book(book_info: BookInfo) -> Optional[BytesIO]:
"""Download a book from available sources.

Args:
Expand All @@ -209,27 +250,24 @@ def download_book(book_id: str, title: str) -> Optional[BytesIO]:
Optional[BytesIO]: Book content buffer if successful
"""

download_links = [
f"https://annas-archive.org/slow_download/{book_id}/0/2",
f"https://libgen.li/ads.php?md5={book_id}",
f"https://library.lol/fiction/{book_id}",
f"https://library.lol/main/{book_id}",
f"https://annas-archive.org/slow_download/{book_id}/0/0",
f"https://annas-archive.org/slow_download/{book_id}/0/1"
]

"""If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources."""

if len(book_info.download_urls) == 0:
book_info = get_book_info(book_info.id)
download_links = book_info.download_urls

# If AA_DONATOR_KEY is set, use the fast download URL. Else try other sources.
if AA_DONATOR_KEY is not None:
download_links.insert(0,
f"https://annas-archive.org/dyn/api/fast_download.json?md5={book_id}&key={AA_DONATOR_KEY}"
f"{AA_BASE_URL}/dyn/api/fast_download.json?md5={book_info.id}&key={AA_DONATOR_KEY}"
)

for link in download_links:
try:
download_url = _get_download_url(link, title)
download_url = _get_download_url(link, book_info.title)
if download_url:
logger.info(f"Downloading {title} from {download_url}")
return network.download_url(download_url)
logger.info(f"Downloading {book_info.title} from {download_url}")
return network.download_url(download_url, book_info.size)
except Exception as e:
logger.error(f"Failed to download from {link}: {e}")
continue
Expand All @@ -239,35 +277,27 @@ def download_book(book_id: str, title: str) -> Optional[BytesIO]:
def _get_download_url(link: str, title: str) -> Optional[str]:
"""Extract actual download URL from various source pages."""

if link.startswith("https://annas-archive.org/dyn/api/fast_download.json"):
if link.startswith(f"{AA_BASE_URL}/dyn/api/fast_download.json"):
page = network.html_get_page(link)
return json.loads(page).get("download_url")

html = network.html_get_page_cf(link)

try:
html = network.html_get_page(link, retry=0, skip_403=True)
except:
html = network.html_get_page_cf(link)

if not html:
return None

soup = BeautifulSoup(html, 'html.parser')
url = None

if link.startswith("https://z-lib.gs"):
download_link = soup.find_all('a', href=True, class_="addDownloadedBook")
if download_link:
parsed = urlparse(download_link[0]['href'])
return f"{parsed.scheme}://{parsed.netloc}{download_link[0]['href']}"

elif link.startswith("https://libgen.li"):
get_section = soup.find_all('h2', string="GET")
if get_section:
href = get_section[0].parent['href']
parsed = urlparse(href)
return f"{parsed.scheme}://{parsed.netloc}/{href}"

elif link.startswith("https://library.lol/fiction/"):
get_section = soup.find_all('h2', string="GET")
if get_section:
return get_section[0].parent['href']

elif link.startswith("https://annas-archive.org/slow_download/"):
url = download_link[0]['href']
elif link.startswith(f"{AA_BASE_URL}/slow_download/"):
download_links = soup.find_all('a', href=True, string="📚 Download now")
if not download_links:
countdown = soup.find_all('span', class_="js-partner-countdown")
Expand All @@ -277,6 +307,8 @@ def _get_download_url(link: str, title: str) -> Optional[str]:
time.sleep(sleep_time + 5)
return _get_download_url(link, title)
else:
return download_links[0]['href']

return None
url = download_links[0]['href']
else:
url = soup.find_all('a', string="GET")[0]['href']

return network.get_absolute_url(link, url)
5 changes: 5 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@
# Network settings, each overridable through an environment variable.
MAX_RETRY = int(os.getenv("MAX_RETRY", 3))
DEFAULT_SLEEP = int(os.getenv("DEFAULT_SLEEP", 5))
CLOUDFLARE_PROXY = os.getenv("CLOUDFLARE_PROXY_URL", "http://localhost:8000")
# Parsed straight into a bool so the name never holds an intermediate string;
# "true"/"yes"/"1"/"y" (any case, surrounding whitespace ignored) enable the
# Cloudflare bypass, every other value disables it.
USE_CF_BYPASS = os.getenv("USE_CF_BYPASS", "false").strip().lower() in ["true", "yes", "1", "y"]

# Anna's Archive settings
AA_DONATOR_KEY = os.getenv("AA_DONATOR_KEY", None)
# Leading/trailing slashes are stripped so paths can be appended as "/path".
AA_BASE_URL = os.getenv("AA_BASE_URL", "https://annas-archive.org").strip("/")

# File format settings
SUPPORTED_FORMATS = os.getenv("SUPPORTED_FORMATS", "epub,mobi,azw3,fb2,djvu,cbz,cbr")
Expand Down
13 changes: 7 additions & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ services:
BOOK_LANGUAGE: en
ports:
- 8084:8084
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8084/request/api/status"]
interval: 30s
timeout: 30s
retries: 3
start_period: 5s
# Uncomment the following lines if you want to enable healthcheck
#healthcheck:
# test: ["CMD", "curl", "-f", "http://localhost:8084/request/api/status"]
# interval: 30s
# timeout: 30s
# retries: 3
# start_period: 5s
restart: unless-stopped
volumes:
# This is where the books will be downloaded to, usually it would be
Expand Down
3 changes: 2 additions & 1 deletion models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Data structures and models used across the application."""

from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from enum import Enum
from config import INGEST_DIR, STATUS_TIMEOUT
Expand All @@ -27,6 +27,7 @@ class BookInfo:
format: Optional[str] = None
size: Optional[str] = None
info: Optional[Dict[str, List[str]]] = None
download_urls: Optional[List[str]] = field(default_factory=list)

class BookQueue:
"""Thread-safe book queue manager."""
Expand Down
45 changes: 40 additions & 5 deletions network.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
from io import BytesIO
import urllib.request
from typing import Optional
from urllib.parse import urlparse
from tqdm import tqdm

from logger import setup_logger
from config import MAX_RETRY, DEFAULT_SLEEP, CLOUDFLARE_PROXY, AA_DONATOR_KEY
from config import MAX_RETRY, DEFAULT_SLEEP, CLOUDFLARE_PROXY, USE_CF_BYPASS

logger = setup_logger(__name__)

Expand All @@ -23,7 +25,7 @@ def setup_urllib_opener():

setup_urllib_opener()

def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False) -> Optional[str]:
def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False, skip_403: bool = False) -> Optional[str]:
"""Fetch HTML content from a URL with retry mechanism.

Args:
Expand All @@ -41,6 +43,10 @@ def html_get_page(url: str, retry: int = MAX_RETRY, skip_404: bool = False) -> O
if skip_404 and response.status_code == 404:
logger.warning(f"404 error for URL: {url}")
return None

if skip_403 and response.status_code == 403:
logger.warning(f"403 error for URL: {url}. Should retry using cloudflare bypass.")
return None

response.raise_for_status()
time.sleep(1)
Expand Down Expand Up @@ -68,6 +74,9 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
Returns:
str: HTML content if successful, None otherwise
"""
if USE_CF_BYPASS == False:
logger.warning("Cloudflare bypass is disabled, trying without it.")
return html_get_page(url, retry, skip_403=True)
try:
logger.info(f"GET_CF: {url}")
response = requests.get(
Expand All @@ -88,7 +97,7 @@ def html_get_page_cf(url: str, retry: int = MAX_RETRY) -> Optional[str]:
time.sleep(sleep_time)
return html_get_page_cf(url, retry - 1)

def download_url(link: str) -> Optional[BytesIO]:
def download_url(link: str, size: str = None) -> Optional[BytesIO]:
"""Download content from URL into a BytesIO buffer.

Args:
Expand All @@ -101,11 +110,37 @@ def download_url(link: str) -> Optional[BytesIO]:
logger.info(f"Downloading from: {link}")
response = requests.get(link, stream=True)
response.raise_for_status()

try:
total_size = size.strip().replace(" ", "").replace(",", ".").upper()
# we assume size is in MB
total_size = int(float(total_size[:-2].strip()) * 1024 * 1024)
except:
total_size = int(response.headers.get('content-length', 0))

buffer = BytesIO()
buffer.write(response.content)
for chunk in tqdm(response.iter_content(chunk_size=1024), total=total_size, unit='B', unit_scale=True, unit_divisor=1024):
buffer.write(chunk)
buffer.seek(0)
return buffer

except requests.exceptions.RequestException as e:
logger.error(f"Failed to download from {link}: {e}")
return None
return None

def get_absolute_url(base_url: str, url: str) -> Optional[str]:
    """Get absolute URL from a possibly relative URL and a base URL.

    Args:
        base_url: Absolute URL that supplies the scheme and host for any
            part ``url`` is missing.
        url: URL to resolve. May be absolute, protocol-relative
            (``//host/path``) or host-relative (``/path`` or ``path``).

    Returns:
        The absolute URL, or None when ``url`` is None or blank.
    """
    if url is None or url.strip() == "":
        return None
    if url.startswith("http"):
        return url
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    if parsed_url.netloc == "":
        # No host in the URL: anchor it at the base URL's scheme and host.
        parsed_url = parsed_url._replace(netloc=parsed_base.netloc, scheme=parsed_base.scheme)
    elif parsed_url.scheme == "":
        # Protocol-relative URL: it already names its own host, so only the
        # scheme is borrowed from the base. Replacing the host here would
        # silently redirect the link to the wrong server.
        parsed_url = parsed_url._replace(scheme=parsed_base.scheme)
    return parsed_url.geturl()
10 changes: 10 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,15 @@ An intuitive web interface for searching and requesting book downloads, designed

Note that PDFs are NOT supported at the moment (they do not get ingested by CWA), but if you just want to download them locally, you can add `pdf` to the `SUPPORTED_FORMATS` env variable.

#### AA

| Variable | Description | Default Value |
| ---------------------- | --------------------------------------------------------- | --------------------------------- |
| `AA_BASE_URL` | Base URL of Annas-Archive (could be changed for a proxy) | `https://annas-archive.org` |
| `USE_CF_BYPASS`        | Enable the Cloudflare bypass; set to `false` to use alternative links instead | `false`       |

If you are a donator on AA, you can set your key in `AA_DONATOR_KEY` to speed up downloads and bypass the wait times.
If you disable the Cloudflare bypass, you will be using alternative download hosts, such as libgen or z-lib, but they usually have a delay before getting the more recent books and their collection is not as big as AA's. This setting should still work for the majority of books.

#### Network Settings

Expand All @@ -82,6 +90,8 @@ If you are a donator on AA, you can use your Key in `AA_DONATOR_API_KEY` to spee
| `CLOUDFLARE_PROXY_URL` | Cloudflare bypass service URL | `http://localhost:8000` |
| `PORT` | Container external port | `8084` |

`CLOUDFLARE_PROXY_URL` is ignored if `USE_CF_BYPASS` is set to `false`

### Volume Configuration

```yaml
Expand Down
Loading