Skip to content

Commit

Permalink
Add Range header support, update docs, fix minor bugs, enhance logging
Browse files Browse the repository at this point in the history
  • Loading branch information
mjishnu committed Jan 3, 2025
1 parent 5dcd938 commit a5267d8
Show file tree
Hide file tree
Showing 7 changed files with 272 additions and 164 deletions.
25 changes: 14 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Each option is explained below:
- `data`: The data to send in the body of the request. The default value is `None`.
- `json`: A JSON-compatible Python object to send in the body of the request. The default value is `None`.
- `cookies`: HTTP Cookies to send with the request. The default value is `None`.
- `headers`: HTTP Headers to send with the request. The default value is `None`.
- `headers`: HTTP headers to be sent with the request. The default value is `None`. *Please note that [multi-range headers](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range#requesting_multiple_ranges) are not supported*.
- `auth`: An object that represents HTTP Basic Authorization. The default value is `None`.
- `allow_redirects`: If set to False, do not follow redirects. The default value is `True`.
- `max_redirects`: Maximum number of redirects to follow. The default value is `10`.
Expand All @@ -116,15 +116,15 @@ import aiohttp
from pypdl import Pypdl

def main():
# Using headers
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"}
# Using headers
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", "range":"bytes=-10485760"}
# Using proxy
proxy = "http://user:[email protected]"
# Using timeout
timeout = aiohttp.ClientTimeout(sock_read=20)

# create a new pypdl object
dl = Pypdl(headers=headers, proxy=proxy, timeout=timeout)
dl = Pypdl()

# start the download
dl.start(
Expand All @@ -136,13 +136,16 @@ def main():
block=True,
retries=3,
etag=True,
headers=headers,
proxy=proxy,
timeout=timeout
)

if __name__ == '__main__':
main()
```

This example downloads a file from the internet using 10 segments and displays the download progress. If the download fails, it will retry up to 3 times. We are also using headers, a proxy, and a timeout. For more information on these parameters, refer to the [API reference](https://github.com/mjishnu/pypdl?tab=readme-ov-file#pypdl-1).
This example downloads a file from the internet using 10 segments and displays the download progress. If the download fails, it will retry up to 3 times. We are also using headers to set the User-Agent and Range to download the last 10MB of the file, as well as a proxy and timeout. For more information on these parameters, refer to the [API reference](https://github.com/mjishnu/pypdl?tab=readme-ov-file#pypdl-1).

Another example of implementing pause/resume functionality, printing the progress to the console, and changing the log level to debug:

Expand Down Expand Up @@ -274,7 +277,7 @@ from pypdl import pypdl
proxy = "http://user:[email protected]"

# create a pypdl object with max_concurrent set to 2
dl = pypdl(max_concurrent=2, allow_reuse=True, proxy=proxy)
dl = pypdl(max_concurrent=2, allow_reuse=True)

# List of tasks to be downloaded..
tasks = [
Expand All @@ -285,8 +288,8 @@ tasks = [
{'url':'https://example.com/file5.zip', 'file_path': 'file5.zip'},
]

# start the download process
results = dl.start(tasks=tasks, display=True, block=False)
# start the download process with proxy
results = dl.start(tasks=tasks, display=True, block=False,proxy=proxy)

# do something
# ...
Expand All @@ -297,8 +300,8 @@ dl.stop()
# do something
# ...

# restart the download process
results = factory.start(tasks=tasks, display=True, block=True)
# restart the download process without proxy
results = dl.start(tasks=tasks, display=True, block=True)

# print the results
for url, result in results:
Expand Down Expand Up @@ -398,7 +401,7 @@ block=True
- `data`: The data to send in the body of the request. The default value is `None`.
- `json`: A JSON-compatible Python object to send in the body of the request. The default value is `None`.
- `cookies`: HTTP Cookies to send with the request. The default value is `None`.
- `headers`: HTTP Headers to send with the request. The default value is `None`.
- `headers`: HTTP headers to be sent with the request. The default value is `None`. *Please note that [multi-range headers](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range#requesting_multiple_ranges) are not supported*.
- `auth`: An object that represents HTTP Basic Authorization. The default value is `None`.
- `allow_redirects`: If set to False, do not follow redirects. The default value is `True`.
- `max_redirects`: Maximum number of redirects to follow. The default value is `10`.
Expand Down
4 changes: 2 additions & 2 deletions pypdl/consumer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ async def process_tasks(self, in_queue, out_queue):
except asyncio.CancelledError:
raise
except Exception as e:
self.logger.exception("Task %s failed", self.id)
self.logger.error(e)
self.logger.debug("Task %s failed", self.id)
self.logger.exception(e)
await out_queue.put([task[0]])

self._workers.clear()
Expand Down
14 changes: 7 additions & 7 deletions pypdl/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,26 +53,26 @@ async def worker(self, segment_table: dict, id: int, **kwargs) -> None:
url = segment_table["url"]
overwrite = segment_table["overwrite"]
segment_path = segment_table[id]["segment_path"]
start = segment_table[id]["start"]
end = segment_table[id]["end"]
size = segment_table[id]["segment_size"]

if await aiofiles.os.path.exists(segment_path):
downloaded_size = await aiofiles.os.path.getsize(segment_path)
if overwrite or downloaded_size > size:
if overwrite or downloaded_size > size.value:
await aiofiles.os.remove(segment_path)
else:
self.curr = downloaded_size

if kwargs.get("headers") is not None:
kwargs["headers"] = kwargs["headers"].copy()

if self.curr < size:
start = start + self.curr
kwargs.setdefault("headers", {}).update({"range": f"bytes={start}-{end}"})
if self.curr < size.value:
start = size.start + self.curr
kwargs.setdefault("headers", {}).update(
{"range": f"bytes={start}-{size.end}"}
)
await self.download(url, segment_path, "ab", **kwargs)

if self.curr == size:
if self.curr == size.value:
self.completed = True
else:
raise Exception(
Expand Down
41 changes: 31 additions & 10 deletions pypdl/producer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import asyncio
from .utils import get_filepath
from .utils import get_filepath, Size, get_range


class Producer:
Expand Down Expand Up @@ -38,6 +38,7 @@ async def enqueue_tasks(self, in_queue: asyncio.queues, out_queue):
multisegment,
etag,
size,
kwargs,
) = await self._fetch_task_info(
task.url, task.file_path, task.multisegment, **task.kwargs
)
Expand All @@ -49,15 +50,15 @@ async def enqueue_tasks(self, in_queue: asyncio.queues, out_queue):
self.logger.debug(
f"Failed to get header for {task}, skipping task"
)
self.logger.error(e)
self.logger.exception(e)
continue

if size == 0:
if size.value == 0:
self.logger.debug("Size is Unavailable, setting size to None")
self.size_avail = False

total_size -= task.size
total_size += size
total_size -= task.size.value
total_size += size.value
task.size = size
await out_queue.put(
(
Expand All @@ -72,7 +73,7 @@ async def enqueue_tasks(self, in_queue: asyncio.queues, out_queue):
task.overwrite,
task.speed_limit,
task.etag_validation,
task.kwargs,
kwargs,
),
)
)
Expand All @@ -90,21 +91,41 @@ async def _fetch_task_info(self, url, file_path, multisegment, **kwargs):
if callable(url):
url = url()

user_headers = kwargs.get("headers", {})
range_header = None

if user_headers:
_user_headers = user_headers.copy()
for key, value in user_headers.items():
if key.lower() == "range":
range_header = value
self.logger.debug("Range header found %s", range_header)
del _user_headers[key]
kwargs["headers"] = _user_headers

header = await self._fetch_header(url, **kwargs)
file_path = await get_filepath(url, header, file_path)
if size := int(header.get("content-length", 0)):
self.logger.debug("Size acquired from header")
if file_size := int(header.get("content-length", 0)):
self.logger.debug("File size acquired from header")

if range_header:
start, end = get_range(range_header, file_size)
else:
start = 0
end = file_size - 1

size = Size(start, end)
etag = header.get("etag", "")
if etag != "":
self.logger.debug("ETag acquired from header")
etag = etag.strip('"')

if not size or not header.get("accept-ranges"):
if size.value < 1 or not header.get("accept-ranges"):
self.logger.debug("Single segment mode, accept-ranges or size not found")
kwargs["headers"] = user_headers
multisegment = False

return url, file_path, multisegment, etag, size
return url, file_path, multisegment, etag, size, kwargs

async def _fetch_header(self, url, **kwargs):
try:
Expand Down
2 changes: 1 addition & 1 deletion pypdl/pypdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def stop(self) -> None:
"""Stop the download manager."""
self._logger.debug("stop called")
if self.is_idle or self.completed:
self._logger.debug("Task not running")
self._logger.debug("Task not running, nothing to stop")
return None
self._future._stop()
self._interrupt.set()
Expand Down
Loading

0 comments on commit a5267d8

Please sign in to comment.