From 8211fe65703f7c788db0a561c5bb13af2f11646c Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Tue, 12 Dec 2023 14:45:29 +0000 Subject: [PATCH] V4.0.0 (#159) 1. migrate from httpx to curl_cffi 2. CURL: simplified downloads, use ThreadPoolExecutor instead of asyncio 3. exceptions: simplified, use DuckDuckGoSearchException and VQDExtractionException 4. github workflow: added windows and macos 5. tests: removed delays between tests, activated tests for text(backend="lite") 6. CLI: save_json() - set encoding="utf-8" (bugfix for windows) --- .github/workflows/python-package.yml | 8 +-- .gitignore | 8 +-- README.md | 10 +-- duckduckgo_search/cli.py | 71 +++++++++----------- duckduckgo_search/duckduckgo_search.py | 35 ++++------ duckduckgo_search/duckduckgo_search_async.py | 34 ++++------ duckduckgo_search/exceptions.py | 16 ----- duckduckgo_search/version.py | 2 +- pyproject.toml | 3 +- requirements.txt | 3 +- tests/test_cli.py | 8 --- tests/test_duckduckgo_search.py | 16 ++--- tests/test_duckduckgo_search_async.py | 17 ++--- 13 files changed, 81 insertions(+), 150 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7d4f564..bad3d25 100755 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,10 +12,12 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: matrix: python-version: ["3.8", "3.12"] + os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v3 @@ -25,9 +27,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install ruff pytest pytest-asyncio - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python -m pip install .[dev] - name: Ruff run: | ruff format . --check --target-version py38 diff --git a/.gitignore b/.gitignore index fb94ccf..6769e21 100755 --- a/.gitignore +++ b/.gitignore @@ -157,10 +157,4 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ - -# vqd_cache -/vqd_cache - -# vscode -.vscode +#.idea/ \ No newline at end of file diff --git a/README.md b/README.md index 8becb2a..6f87e47 100755 --- a/README.md +++ b/README.md @@ -38,21 +38,21 @@ python -m duckduckgo_search --help CLI examples: ```python3 # text search -ddgs text -k 'ayrton senna' +ddgs text -k "ayrton senna" # text search via proxy (example: Tor Browser) -ddgs text -k 'china is a global threat' -p socks5://localhost:9150 +ddgs text -k "china is a global threat" -p socks5://localhost:9150 # find and download pdf files ddgs text -k "russia filetype:pdf" -m 50 -d # find in es-es region and download pdf files via proxy (example: Tor browser) ddgs text -k "embajada a tamorlán filetype:pdf" -r es-es -m 50 -d -p socks5://localhost:9150 # find and download xls files from a specific site -ddgs text -k 'sanctions filetype:xls site:gov.ua' -m 50 -d +ddgs text -k "sanctions filetype:xls site:gov.ua" -m 50 -d # find and download any doc(x) files from a specific site -ddgs text -k 'filetype:doc site:mos.ru' -m 50 -d +ddgs text -k "filetype:doc site:mos.ru" -m 50 -d # find and download images ddgs images -k "yuri kuklachev cat theatre" -m 500 -s off -d # find in br-br region and download images via proxy (example: Tor browser) in 10 threads -ddgs images -k 'rio carnival' -r br-br -s off -m 500 -d -th 10 -p socks5://localhost:9150 +ddgs images -k "rio carnival" -r br-br -s off -m 500 -d -th 10 -p socks5://localhost:9150 # get latest news ddgs news -k "ukraine war" -s off -t d -m 10 # get last day's news and save it to a csv file diff --git a/duckduckgo_search/cli.py b/duckduckgo_search/cli.py index 38b2eb9..69f357e 100644 --- a/duckduckgo_search/cli.py +++ b/duckduckgo_search/cli.py @@ -1,16 +1,14 @@ -import asyncio import csv import json import logging import os -import ssl +from concurrent.futures import as_completed, ThreadPoolExecutor from datetime import datetime from random import choice from urllib.parse import unquote -import aiofiles import click -import httpx +from curl_cffi import requests from .duckduckgo_search import DDGS, USERAGENTS from .version import __version__ @@ -36,7 +34,7 @@ def save_json(jsonfile, data): - with open(jsonfile, "w") as file: + with open(jsonfile, "w", encoding="utf-8") as file: json.dump(data, file, ensure_ascii=False, indent=4) @@ -80,47 +78,40 @@ def sanitize_keywords(keywords): return keywords -async def download_file(url, dir_path, filename, sem, proxy): +def download_file(url, dir_path, filename, proxy): headers = {"User-Agent": choice(USERAGENTS)} - for i in range(2): - try: - async with sem, httpx.AsyncClient(headers=headers, proxies=proxy, timeout=10) as client: - async with client.stream("GET", url) as resp: - resp.raise_for_status() - async with aiofiles.open(os.path.join(dir_path, filename[:200]), "wb") as file: - async for chunk in resp.aiter_bytes(): - await file.write(chunk) - break - except (httpx.HTTPError, ssl.SSLCertVerificationError, ssl.SSLError) as ex: - logger.debug(f"download_file url={url} {type(ex).__name__} {ex}") - except ValueError as ex: - raise ex - - -async def _download_results(keywords, results, images=False, proxy=None, threads=None): + try: + with requests.Session(headers=headers, proxies=proxy, impersonate="chrome110") as session: + resp = session.get(url, stream=True) + resp.raise_for_status() + with open(os.path.join(dir_path, filename[:200]), "wb") as file: + for chunk in resp.iter_content(): + file.write(chunk) + except Exception as ex: + logger.debug(f"download_file url={url} {type(ex).__name__} {ex}") + + +def download_results(keywords, results, images=False, proxy=None, threads=None): path_type = "images" if images else "text" path = f"{path_type}_{keywords}_{datetime.now():%Y%m%d_%H%M%S}" os.makedirs(path, exist_ok=True) + proxy = {"http": proxy, "https": proxy} threads = 10 if threads is None else threads - sem = asyncio.Semaphore(threads) - tasks = [] - for i, res in enumerate(results, start=1): - url = res["image"] if images else res["href"] - filename = unquote(url.split("/")[-1].split("?")[0]) - task = asyncio.create_task(download_file(url, path, f"{i}_{filename}", sem, proxy)) - tasks.append(task) - - with click.progressbar(length=len(tasks), label="Downloading", show_percent=True, show_pos=True, width=50) as bar: - for future in asyncio.as_completed(tasks): - await future - bar.update(1) - - await asyncio.gather(*tasks) - - -def download_results(keywords, results, images=False, proxy=None, threads=None): - asyncio.run(_download_results(keywords, results, images, proxy)) + with ThreadPoolExecutor(max_workers=threads) as executor: + futures = [] + for i, res in enumerate(results, start=1): + url = res["image"] if images else res["href"] + filename = unquote(url.split("/")[-1].split("?")[0]) + f = executor.submit(download_file, url, path, f"{i}_{filename}", proxy) + futures.append(f) + + with click.progressbar( + length=len(futures), label="Downloading", show_percent=True, show_pos=True, width=50 + ) as bar: + for future in as_completed(futures): + future.result() + bar.update(1) @click.group(chain=True) diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index db6efdc..27e394f 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -1,4 +1,3 @@ -import json import logging from collections import deque from datetime import datetime, timezone @@ -8,10 +7,10 @@ from time import sleep from typing import Deque, Dict, Iterator, Optional, Set, Tuple -import httpx from lxml import html +from curl_cffi import requests -from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException +from .exceptions import DuckDuckGoSearchException from .models import MapsResult from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json @@ -31,31 +30,25 @@ def __init__(self, headers=None, proxies=None, timeout=10) -> None: if headers is None: headers = HEADERS headers["User-Agent"] = choice(USERAGENTS) - self.proxies = proxies - self._client = httpx.Client(headers=headers, proxies=proxies, timeout=timeout, http2=True, verify=False) + self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies} + self._session = requests.Session( + headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110" + ) def __enter__(self) -> "DDGS": return self def __exit__(self, exc_type, exc_val, exc_tb) -> None: - self._client.close() + self._session.close() - def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]: + def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]: try: - resp = self._client.request(method, url, follow_redirects=True, **kwargs) - if _is_500_in_url(str(resp.url)) or resp.status_code == 403: - raise APIException(f"_get_url() {url}") - if resp.status_code == 202: - raise RateLimitException(f"_get_url() {url}") + resp = self._session.request(method, url, **kwargs) + resp.raise_for_status() + if _is_500_in_url(str(resp.url)) or resp.status_code == 202: + raise if resp.status_code == 200: return resp - resp.raise_for_status() - except httpx.TimeoutException as ex: - raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}") - except (APIException, RateLimitException): - raise - except httpx.HTTPError as ex: - raise HTTPException(f"_get_url() {url} HttpError: {ex}") except Exception as ex: raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}") @@ -205,7 +198,7 @@ def _text_html( """ assert keywords, "keywords is mandatory" - self._client.headers["Referer"] = "https://html.duckduckgo.com/" + self._session.headers["Referer"] = "https://html.duckduckgo.com/" safesearch_base = {"on": 1, "moderate": -1, "off": -2} payload = { "q": keywords, @@ -857,7 +850,7 @@ def translate( "POST", "https://duckduckgo.com/translation.js", params=payload, - content=keywords.encode(), + data=keywords.encode(), ) if resp is None: return None diff --git a/duckduckgo_search/duckduckgo_search_async.py b/duckduckgo_search/duckduckgo_search_async.py index ba7c084..984d12b 100644 --- a/duckduckgo_search/duckduckgo_search_async.py +++ b/duckduckgo_search/duckduckgo_search_async.py @@ -7,10 +7,10 @@ from random import choice from typing import AsyncIterator, Deque, Dict, Optional, Set, Tuple -import httpx from lxml import html +from curl_cffi import requests -from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException +from .exceptions import DuckDuckGoSearchException from .models import MapsResult from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json @@ -30,31 +30,25 @@ def __init__(self, headers=None, proxies=None, timeout=10) -> None: if headers is None: headers = HEADERS headers["User-Agent"] = choice(USERAGENTS) - self.proxies = proxies - self._client = httpx.AsyncClient(headers=headers, proxies=proxies, timeout=timeout, http2=True, verify=False) + self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies} + self._session = requests.Session( + headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110" + ) async def __aenter__(self) -> "AsyncDDGS": return self async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: - await self._client.aclose() + self._session.close() - async def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]: + async def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]: try: - resp = await self._client.request(method, url, follow_redirects=True, **kwargs) - if _is_500_in_url(str(resp.url)) or resp.status_code == 403: - raise APIException(f"_get_url() {url}") - if resp.status_code == 202: - raise RateLimitException(f"_get_url() {url}") + resp = self._session.request(method, url, **kwargs) + resp.raise_for_status() + if _is_500_in_url(str(resp.url)) or resp.status_code == 202: + raise if resp.status_code == 200: return resp - resp.raise_for_status() - except httpx.TimeoutException as ex: - raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}") - except (APIException, RateLimitException): - raise - except httpx.HTTPError as ex: - raise HTTPException(f"_get_url() {url} HttpError: {ex}") except Exception as ex: raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}") @@ -205,7 +199,7 @@ async def _text_html( """ assert keywords, "keywords is mandatory" - self._client.headers["Referer"] = "https://html.duckduckgo.com/" + self._session.headers["Referer"] = "https://html.duckduckgo.com/" safesearch_base = {"on": 1, "moderate": -1, "off": -2} payload = { "q": keywords, @@ -856,7 +850,7 @@ async def translate( "POST", "https://duckduckgo.com/translation.js", params=payload, - content=keywords.encode(), + data=keywords.encode(), ) if resp is None: return None diff --git a/duckduckgo_search/exceptions.py b/duckduckgo_search/exceptions.py index 2177210..61acdce 100644 --- a/duckduckgo_search/exceptions.py +++ b/duckduckgo_search/exceptions.py @@ -2,21 +2,5 @@ class DuckDuckGoSearchException(Exception): """Base exception class for duckduckgo_search.""" -class APIException(DuckDuckGoSearchException): - """Exception raised for API errors.""" - - -class HTTPException(DuckDuckGoSearchException): - """Exception raised for HTTP errors.""" - - -class RateLimitException(DuckDuckGoSearchException): - """Exception raised for rate limit errors.""" - - -class TimeoutException(DuckDuckGoSearchException): - """Exception raised for timeout errors.""" - - class VQDExtractionException(DuckDuckGoSearchException): """Exception raised for error in extract vqd.""" diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py index d74168d..ce1305b 100755 --- a/duckduckgo_search/version.py +++ b/duckduckgo_search/version.py @@ -1 +1 @@ -__version__ = "3.9.11" +__version__ = "4.0.0" diff --git a/pyproject.toml b/pyproject.toml index 7b9327e..2b05a58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,10 +28,9 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ - "aiofiles>=23.2.1", "click>=8.1.7", "lxml>=4.9.3", - "httpx[http2,socks,brotli]>=0.25.1", + "curl_cffi>=0.5.10" ] dynamic = ["version"] diff --git a/requirements.txt b/requirements.txt index 23613ed..f681066 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -aiofiles>=23.2.1 click>=8.1.7 lxml>=4.9.3 -httpx[http2,socks,brotli]>=0.25.1 +curl_cffi>=0.5.10 \ No newline at end of file diff --git a/tests/test_cli.py b/tests/test_cli.py index f87faf1..70304fd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,8 +1,6 @@ import os import shutil -from time import sleep -import pytest from click.testing import CliRunner from duckduckgo_search import DDGS, __version__ @@ -11,12 +9,6 @@ runner = CliRunner() -@pytest.fixture(autouse=True) -def slow_down_tests(): - yield - sleep(2) - - def test_version_command(): result = runner.invoke(cli, ["version"]) assert result.output.strip() == __version__ diff --git a/tests/test_duckduckgo_search.py b/tests/test_duckduckgo_search.py index 0947b99..90a8562 100644 --- a/tests/test_duckduckgo_search.py +++ b/tests/test_duckduckgo_search.py @@ -1,14 +1,6 @@ -from time import sleep -import pytest from duckduckgo_search import DDGS -@pytest.fixture(autouse=True) -def slow_down_tests(): - yield - sleep(2) - - def test_text(): with DDGS() as ddgs: results = [x for x in ddgs.text("cat", max_results=30)] @@ -27,10 +19,10 @@ def test_text_html(): assert len(results) == 30 -# def test_text_lite(): -# with DDGS() as ddgs: -# results = [x for x in ddgs.text("dog", backend="lite", max_results=30)] -# assert len(results) == 30 +def test_text_lite(): + with DDGS() as ddgs: + results = [x for x in ddgs.text("dog", backend="lite", max_results=30)] + assert len(results) == 30 def test_images(): diff --git a/tests/test_duckduckgo_search_async.py b/tests/test_duckduckgo_search_async.py index e8845a8..a92f0db 100644 --- a/tests/test_duckduckgo_search_async.py +++ b/tests/test_duckduckgo_search_async.py @@ -1,15 +1,8 @@ -from time import sleep import pytest from duckduckgo_search import AsyncDDGS -@pytest.fixture(autouse=True) -def slow_down_tests(): - yield - sleep(2) - - @pytest.mark.asyncio async def test_text(): async with AsyncDDGS() as ddgs: @@ -31,11 +24,11 @@ async def test_text_html(): assert len(results) == 30 -# @pytest.mark.asyncio -# async def test_text_lite(): -# async with AsyncDDGS() as ddgs: -# results = [x async for x in ddgs.text("dog", backend="lite", max_results=30)] -# assert len(results) == 30 +@pytest.mark.asyncio +async def test_text_lite(): + async with AsyncDDGS() as ddgs: + results = [x async for x in ddgs.text("dog", backend="lite", max_results=30)] + assert len(results) == 30 @pytest.mark.asyncio