Skip to content

Commit

Permalink
Add Range header support, update docs, fix minor bugs, enhance logging
Browse files Browse the repository at this point in the history
  • Loading branch information
mjishnu committed Jan 3, 2025
1 parent 5dcd938 commit a5267d8
Show file tree
Hide file tree
Showing 7 changed files with 272 additions and 164 deletions.
25 changes: 14 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Each option is explained below:
- `data`: The data to send in the body of the request. The default value is `None`.
- `json`: A JSON-compatible Python object to send in the body of the request. The default value is `None`.
- `cookies`: HTTP Cookies to send with the request. The default value is `None`.
- `headers`: HTTP Headers to send with the request. The default value is `None`.
- `headers`: HTTP headers to be sent with the request. The default value is `None`. *Please note that [multi-range headers](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range#requesting_multiple_ranges) are not supported*.
- `auth`: An object that represents HTTP Basic Authorization. The default value is `None`.
- `allow_redirects`: If set to False, do not follow redirects. The default value is `True`.
- `max_redirects`: Maximum number of redirects to follow. The default value is `10`.
Expand All @@ -116,15 +116,15 @@ import aiohttp
from pypdl import Pypdl

def main():
# Using headers
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"}
# Using headers
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", "range":"bytes=-10485760"}
# Using proxy
proxy = "http://user:[email protected]"
# Using timeout
timeout = aiohttp.ClientTimeout(sock_read=20)

# create a new pypdl object
dl = Pypdl(headers=headers, proxy=proxy, timeout=timeout)
dl = Pypdl()

# start the download
dl.start(
Expand All @@ -136,13 +136,16 @@ def main():
block=True,
retries=3,
etag=True,
headers=headers,
proxy=proxy,
timeout=timeout
)

if __name__ == '__main__':
main()
```

This example downloads a file from the internet using 10 segments and displays the download progress. If the download fails, it will retry up to 3 times. We are also using headers, a proxy, and a timeout. For more information on these parameters, refer to the [API reference](https://github.com/mjishnu/pypdl?tab=readme-ov-file#pypdl-1).
This example downloads a file from the internet using 10 segments and displays the download progress. If the download fails, it will retry up to 3 times. We are also using headers to set the User-Agent and Range to download the last 10MB of the file, as well as a proxy and timeout. For more information on these parameters, refer to the [API reference](https://github.com/mjishnu/pypdl?tab=readme-ov-file#pypdl-1).

Another example of implementing pause/resume functionality, printing the progress to the console, and changing the log level to debug:

Expand Down Expand Up @@ -274,7 +277,7 @@ from pypdl import pypdl
proxy = "http://user:[email protected]"

# create a pypdl object with max_concurrent set to 2
dl = pypdl(max_concurrent=2, allow_reuse=True, proxy=proxy)
dl = pypdl(max_concurrent=2, allow_reuse=True)

# List of tasks to be downloaded..
tasks = [
Expand All @@ -285,8 +288,8 @@ tasks = [
{'url':'https://example.com/file5.zip', 'file_path': 'file5.zip'},
]

# start the download process
results = dl.start(tasks=tasks, display=True, block=False)
# start the download process with proxy
results = dl.start(tasks=tasks, display=True, block=False,proxy=proxy)

# do something
# ...
Expand All @@ -297,8 +300,8 @@ dl.stop()
# do something
# ...

# restart the download process
results = factory.start(tasks=tasks, display=True, block=True)
# restart the download process without proxy
results = dl.start(tasks=tasks, display=True, block=True)

# print the results
for url, result in results:
Expand Down Expand Up @@ -398,7 +401,7 @@ block=True
- `data`: The data to send in the body of the request. The default value is `None`.
- `json`: A JSON-compatible Python object to send in the body of the request. The default value is `None`.
- `cookies`: HTTP Cookies to send with the request. The default value is `None`.
- `headers`: HTTP Headers to send with the request. The default value is `None`.
- `headers`: HTTP headers to be sent with the request. The default value is `None`. *Please note that [multi-range headers](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range#requesting_multiple_ranges) are not supported*.
- `auth`: An object that represents HTTP Basic Authorization. The default value is `None`.
- `allow_redirects`: If set to False, do not follow redirects. The default value is `True`.
- `max_redirects`: Maximum number of redirects to follow. The default value is `10`.
Expand Down
4 changes: 2 additions & 2 deletions pypdl/consumer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ async def process_tasks(self, in_queue, out_queue):
except asyncio.CancelledError:
raise
except Exception as e:
self.logger.exception("Task %s failed", self.id)
self.logger.error(e)
self.logger.debug("Task %s failed", self.id)
self.logger.exception(e)
await out_queue.put([task[0]])

self._workers.clear()
Expand Down
14 changes: 7 additions & 7 deletions pypdl/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,26 +53,26 @@ async def worker(self, segment_table: dict, id: int, **kwargs) -> None:
url = segment_table["url"]
overwrite = segment_table["overwrite"]
segment_path = segment_table[id]["segment_path"]
start = segment_table[id]["start"]
end = segment_table[id]["end"]
size = segment_table[id]["segment_size"]

if await aiofiles.os.path.exists(segment_path):
downloaded_size = await aiofiles.os.path.getsize(segment_path)
if overwrite or downloaded_size > size:
if overwrite or downloaded_size > size.value:
await aiofiles.os.remove(segment_path)
else:
self.curr = downloaded_size

if kwargs.get("headers") is not None:
kwargs["headers"] = kwargs["headers"].copy()

if self.curr < size:
start = start + self.curr
kwargs.setdefault("headers", {}).update({"range": f"bytes={start}-{end}"})
if self.curr < size.value:
start = size.start + self.curr
kwargs.setdefault("headers", {}).update(
{"range": f"bytes={start}-{size.end}"}
)
await self.download(url, segment_path, "ab", **kwargs)

if self.curr == size:
if self.curr == size.value:
self.completed = True
else:
raise Exception(
Expand Down
41 changes: 31 additions & 10 deletions pypdl/producer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import asyncio
from .utils import get_filepath
from .utils import get_filepath, Size, get_range


class Producer:
Expand Down Expand Up @@ -38,6 +38,7 @@ async def enqueue_tasks(self, in_queue: asyncio.queues, out_queue):
multisegment,
etag,
size,
kwargs,
) = await self._fetch_task_info(
task.url, task.file_path, task.multisegment, **task.kwargs
)
Expand All @@ -49,15 +50,15 @@ async def enqueue_tasks(self, in_queue: asyncio.queues, out_queue):
self.logger.debug(
f"Failed to get header for {task}, skipping task"
)
self.logger.error(e)
self.logger.exception(e)
continue

if size == 0:
if size.value == 0:
self.logger.debug("Size is Unavailable, setting size to None")
self.size_avail = False

total_size -= task.size
total_size += size
total_size -= task.size.value
total_size += size.value
task.size = size
await out_queue.put(
(
Expand All @@ -72,7 +73,7 @@ async def enqueue_tasks(self, in_queue: asyncio.queues, out_queue):
task.overwrite,
task.speed_limit,
task.etag_validation,
task.kwargs,
kwargs,
),
)
)
Expand All @@ -90,21 +91,41 @@ async def _fetch_task_info(self, url, file_path, multisegment, **kwargs):
if callable(url):
url = url()

user_headers = kwargs.get("headers", {})
range_header = None

if user_headers:
_user_headers = user_headers.copy()
for key, value in user_headers.items():
if key.lower() == "range":
range_header = value
self.logger.debug("Range header found %s", range_header)
del _user_headers[key]
kwargs["headers"] = _user_headers

header = await self._fetch_header(url, **kwargs)
file_path = await get_filepath(url, header, file_path)
if size := int(header.get("content-length", 0)):
self.logger.debug("Size acquired from header")
if file_size := int(header.get("content-length", 0)):
self.logger.debug("File size acquired from header")

if range_header:
start, end = get_range(range_header, file_size)
else:
start = 0
end = file_size - 1

size = Size(start, end)
etag = header.get("etag", "")
if etag != "":
self.logger.debug("ETag acquired from header")
etag = etag.strip('"')

if not size or not header.get("accept-ranges"):
if size.value < 1 or not header.get("accept-ranges"):
self.logger.debug("Single segment mode, accept-ranges or size not found")
kwargs["headers"] = user_headers
multisegment = False

return url, file_path, multisegment, etag, size
return url, file_path, multisegment, etag, size, kwargs

async def _fetch_header(self, url, **kwargs):
try:
Expand Down
2 changes: 1 addition & 1 deletion pypdl/pypdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def stop(self) -> None:
"""Stop the download manager."""
self._logger.debug("stop called")
if self.is_idle or self.completed:
self._logger.debug("Task not running")
self._logger.debug("Task not running, nothing to stop")
return None
self._future._stop()
self._interrupt.set()
Expand Down
Loading

0 comments on commit a5267d8

Please sign in to comment.