V4.0.0 (#159)

1. migrate from httpx to curl_cffi
2. CURL: simplified downloads, use ThreadPoolExecutor instead of asyncio
3. exceptions: simplified, use DuckDuckGoSearchException and VQDExtractionException
4. github workflow: added windows and macos
5. tests: removed delays between tests, activated tests for text(backend="lite")
6. CLI: save_json() - set encoding="utf-8" (bugfix for windows)
deedy5 · Dec 12, 2023 · 8211fe6
1 parent 99dbfa0
commit 8211fe6
Show file tree

Hide file tree

Showing 13 changed files with 81 additions and 150 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -12,10 +12,12 @@ on:
 jobs:
   build:
 
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
+
     strategy:
       matrix:
         python-version: ["3.8", "3.12"]
+        os: [ubuntu-latest, macos-latest, windows-latest]
 
     steps:
     - uses: actions/checkout@v3
@@ -25,9 +27,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        python -m pip install ruff pytest pytest-asyncio
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        python -m pip install .[dev]
     - name: Ruff
       run: |
         ruff format . --check --target-version py38

diff --git a/.gitignore b/.gitignore
@@ -157,10 +157,4 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-
-# vqd_cache
-/vqd_cache
-
-# vscode
-.vscode
+#.idea/
diff --git a/README.md b/README.md
@@ -38,21 +38,21 @@ python -m duckduckgo_search --help
 CLI examples:
 ```python3
 # text search
-ddgs text -k 'ayrton senna'
+ddgs text -k "ayrton senna"
 # text search via proxy (example: Tor Browser)
-ddgs text -k 'china is a global threat' -p socks5://localhost:9150
+ddgs text -k "china is a global threat" -p socks5://localhost:9150
 # find and download pdf files
 ddgs text -k "russia filetype:pdf" -m 50 -d
 # find in es-es region and download pdf files via proxy (example: Tor browser)
 ddgs text -k "embajada a tamorlán filetype:pdf" -r es-es -m 50 -d -p socks5://localhost:9150
 # find and download xls files from a specific site
-ddgs text -k 'sanctions filetype:xls site:gov.ua' -m 50 -d
+ddgs text -k "sanctions filetype:xls site:gov.ua" -m 50 -d
 # find and download any doc(x) files from a specific site
-ddgs text -k 'filetype:doc site:mos.ru' -m 50 -d
+ddgs text -k "filetype:doc site:mos.ru" -m 50 -d
 # find and download images
 ddgs images -k "yuri kuklachev cat theatre" -m 500 -s off -d
 # find in br-br region and download images via proxy (example: Tor browser) in 10 threads
-ddgs images -k 'rio carnival' -r br-br -s off -m 500 -d -th 10 -p socks5://localhost:9150
+ddgs images -k "rio carnival" -r br-br -s off -m 500 -d -th 10 -p socks5://localhost:9150
 # get latest news
 ddgs news -k "ukraine war" -s off -t d -m 10
 # get last day's news and save it to a csv file

diff --git a/duckduckgo_search/cli.py b/duckduckgo_search/cli.py
@@ -1,16 +1,14 @@
-import asyncio
 import csv
 import json
 import logging
 import os
-import ssl
+from concurrent.futures import as_completed, ThreadPoolExecutor
 from datetime import datetime
 from random import choice
 from urllib.parse import unquote
 
-import aiofiles
 import click
-import httpx
+from curl_cffi import requests
 
 from .duckduckgo_search import DDGS, USERAGENTS
 from .version import __version__
@@ -36,7 +34,7 @@
 
 
 def save_json(jsonfile, data):
-    with open(jsonfile, "w") as file:
+    with open(jsonfile, "w", encoding="utf-8") as file:
         json.dump(data, file, ensure_ascii=False, indent=4)
 
 
@@ -80,47 +78,40 @@ def sanitize_keywords(keywords):
     return keywords
 
 
-async def download_file(url, dir_path, filename, sem, proxy):
+def download_file(url, dir_path, filename, proxy):
     headers = {"User-Agent": choice(USERAGENTS)}
-    for i in range(2):
-        try:
-            async with sem, httpx.AsyncClient(headers=headers, proxies=proxy, timeout=10) as client:
-                async with client.stream("GET", url) as resp:
-                    resp.raise_for_status()
-                    async with aiofiles.open(os.path.join(dir_path, filename[:200]), "wb") as file:
-                        async for chunk in resp.aiter_bytes():
-                            await file.write(chunk)
-                    break
-        except (httpx.HTTPError, ssl.SSLCertVerificationError, ssl.SSLError) as ex:
-            logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")
-        except ValueError as ex:
-            raise ex
-
-
-async def _download_results(keywords, results, images=False, proxy=None, threads=None):
+    try:
+        with requests.Session(headers=headers, proxies=proxy, impersonate="chrome110") as session:
+            resp = session.get(url, stream=True)
+            resp.raise_for_status()
+            with open(os.path.join(dir_path, filename[:200]), "wb") as file:
+                for chunk in resp.iter_content():
+                    file.write(chunk)
+    except Exception as ex:
+        logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")
+
+
+def download_results(keywords, results, images=False, proxy=None, threads=None):
     path_type = "images" if images else "text"
     path = f"{path_type}_{keywords}_{datetime.now():%Y%m%d_%H%M%S}"
     os.makedirs(path, exist_ok=True)
+    proxy = {"http": proxy, "https": proxy}
 
     threads = 10 if threads is None else threads
-    sem = asyncio.Semaphore(threads)
-    tasks = []
-    for i, res in enumerate(results, start=1):
-        url = res["image"] if images else res["href"]
-        filename = unquote(url.split("/")[-1].split("?")[0])
-        task = asyncio.create_task(download_file(url, path, f"{i}_{filename}", sem, proxy))
-        tasks.append(task)
-
-    with click.progressbar(length=len(tasks), label="Downloading", show_percent=True, show_pos=True, width=50) as bar:
-        for future in asyncio.as_completed(tasks):
-            await future
-            bar.update(1)
-
-    await asyncio.gather(*tasks)
-
-
-def download_results(keywords, results, images=False, proxy=None, threads=None):
-    asyncio.run(_download_results(keywords, results, images, proxy))
+    with ThreadPoolExecutor(max_workers=threads) as executor:
+        futures = []
+        for i, res in enumerate(results, start=1):
+            url = res["image"] if images else res["href"]
+            filename = unquote(url.split("/")[-1].split("?")[0])
+            f = executor.submit(download_file, url, path, f"{i}_{filename}", proxy)
+            futures.append(f)
+
+        with click.progressbar(
+            length=len(futures), label="Downloading", show_percent=True, show_pos=True, width=50
+        ) as bar:
+            for future in as_completed(futures):
+                future.result()
+                bar.update(1)
 
 
 @click.group(chain=True)

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
@@ -1,4 +1,3 @@
-import json
 import logging
 from collections import deque
 from datetime import datetime, timezone
@@ -8,10 +7,10 @@
 from time import sleep
 from typing import Deque, Dict, Iterator, Optional, Set, Tuple
 
-import httpx
 from lxml import html
+from curl_cffi import requests
 
-from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException
+from .exceptions import DuckDuckGoSearchException
 from .models import MapsResult
 from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json
 
@@ -31,31 +30,25 @@ def __init__(self, headers=None, proxies=None, timeout=10) -> None:
         if headers is None:
             headers = HEADERS
             headers["User-Agent"] = choice(USERAGENTS)
-        self.proxies = proxies
-        self._client = httpx.Client(headers=headers, proxies=proxies, timeout=timeout, http2=True, verify=False)
+        self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
+        self._session = requests.Session(
+            headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110"
+        )
 
     def __enter__(self) -> "DDGS":
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-        self._client.close()
+        self._session.close()
 
-    def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]:
+    def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
         try:
-            resp = self._client.request(method, url, follow_redirects=True, **kwargs)
-            if _is_500_in_url(str(resp.url)) or resp.status_code == 403:
-                raise APIException(f"_get_url() {url}")
-            if resp.status_code == 202:
-                raise RateLimitException(f"_get_url() {url}")
+            resp = self._session.request(method, url, **kwargs)
+            resp.raise_for_status()
+            if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
+                raise
             if resp.status_code == 200:
                 return resp
-            resp.raise_for_status()
-        except httpx.TimeoutException as ex:
-            raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}")
-        except (APIException, RateLimitException):
-            raise
-        except httpx.HTTPError as ex:
-            raise HTTPException(f"_get_url() {url} HttpError: {ex}")
         except Exception as ex:
             raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}")
 
@@ -205,7 +198,7 @@ def _text_html(
         """
         assert keywords, "keywords is mandatory"
 
-        self._client.headers["Referer"] = "https://html.duckduckgo.com/"
+        self._session.headers["Referer"] = "https://html.duckduckgo.com/"
         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         payload = {
             "q": keywords,
@@ -857,7 +850,7 @@ def translate(
             "POST",
             "https://duckduckgo.com/translation.js",
             params=payload,
-            content=keywords.encode(),
+            data=keywords.encode(),
         )
         if resp is None:
             return None

diff --git a/duckduckgo_search/duckduckgo_search_async.py b/duckduckgo_search/duckduckgo_search_async.py
@@ -7,10 +7,10 @@
 from random import choice
 from typing import AsyncIterator, Deque, Dict, Optional, Set, Tuple
 
-import httpx
 from lxml import html
+from curl_cffi import requests
 
-from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException
+from .exceptions import DuckDuckGoSearchException
 from .models import MapsResult
 from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json
 
@@ -30,31 +30,25 @@ def __init__(self, headers=None, proxies=None, timeout=10) -> None:
         if headers is None:
             headers = HEADERS
             headers["User-Agent"] = choice(USERAGENTS)
-        self.proxies = proxies
-        self._client = httpx.AsyncClient(headers=headers, proxies=proxies, timeout=timeout, http2=True, verify=False)
+        self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
+        self._session = requests.Session(
+            headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110"
+        )
 
     async def __aenter__(self) -> "AsyncDDGS":
         return self
 
     async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
-        await self._client.aclose()
+        self._session.close()
 
-    async def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]:
+    async def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
         try:
-            resp = await self._client.request(method, url, follow_redirects=True, **kwargs)
-            if _is_500_in_url(str(resp.url)) or resp.status_code == 403:
-                raise APIException(f"_get_url() {url}")
-            if resp.status_code == 202:
-                raise RateLimitException(f"_get_url() {url}")
+            resp = self._session.request(method, url, **kwargs)
+            resp.raise_for_status()
+            if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
+                raise
             if resp.status_code == 200:
                 return resp
-            resp.raise_for_status()
-        except httpx.TimeoutException as ex:
-            raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}")
-        except (APIException, RateLimitException):
-            raise
-        except httpx.HTTPError as ex:
-            raise HTTPException(f"_get_url() {url} HttpError: {ex}")
         except Exception as ex:
             raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}")
 
@@ -205,7 +199,7 @@ async def _text_html(
         """
         assert keywords, "keywords is mandatory"
 
-        self._client.headers["Referer"] = "https://html.duckduckgo.com/"
+        self._session.headers["Referer"] = "https://html.duckduckgo.com/"
         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         payload = {
             "q": keywords,
@@ -856,7 +850,7 @@ async def translate(
             "POST",
             "https://duckduckgo.com/translation.js",
             params=payload,
-            content=keywords.encode(),
+            data=keywords.encode(),
         )
         if resp is None:
             return None

diff --git a/duckduckgo_search/exceptions.py b/duckduckgo_search/exceptions.py
@@ -2,21 +2,5 @@ class DuckDuckGoSearchException(Exception):
     """Base exception class for duckduckgo_search."""
 
 
-class APIException(DuckDuckGoSearchException):
-    """Exception raised for API errors."""
-
-
-class HTTPException(DuckDuckGoSearchException):
-    """Exception raised for HTTP errors."""
-
-
-class RateLimitException(DuckDuckGoSearchException):
-    """Exception raised for rate limit errors."""
-
-
-class TimeoutException(DuckDuckGoSearchException):
-    """Exception raised for timeout errors."""
-
-
 class VQDExtractionException(DuckDuckGoSearchException):
     """Exception raised for error in extract vqd."""
diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py
@@ -1 +1 @@
-__version__ = "3.9.11"
+__version__ = "4.0.0"
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,10 +28,9 @@ classifiers = [
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
 dependencies = [
-    "aiofiles>=23.2.1",
     "click>=8.1.7",
     "lxml>=4.9.3",
-    "httpx[http2,socks,brotli]>=0.25.1",
+    "curl_cffi>=0.5.10"
 ]
 dynamic = ["version"]
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,3 @@
-aiofiles>=23.2.1
 click>=8.1.7
 lxml>=4.9.3
-httpx[http2,socks,brotli]>=0.25.1
+curl_cffi>=0.5.10
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,8 +1,6 @@
 import os
 import shutil
-from time import sleep
 
-import pytest
 from click.testing import CliRunner
 
 from duckduckgo_search import DDGS, __version__
@@ -11,12 +9,6 @@
 runner = CliRunner()
 
 
-@pytest.fixture(autouse=True)
-def slow_down_tests():
-    yield
-    sleep(2)
-
-
 def test_version_command():
     result = runner.invoke(cli, ["version"])
     assert result.output.strip() == __version__