Skip to content

Commit

Permalink
V4.0.0 (#159)
Browse files Browse the repository at this point in the history
1. migrate from httpx to curl_cffi
2. CURL: simplified downloads, use ThreadPoolExecutor instead of asyncio
3. exceptions: simplified, use DuckDuckGoSearchException and VQDExtractionException
4. github workflow: added windows and macos
5. tests: removed delays between tests, activated tests for text(backend="lite")
6. CLI: save_json() - set encoding="utf-8" (bugfix for windows)
  • Loading branch information
deedy5 authored Dec 12, 2023
1 parent 99dbfa0 commit 8211fe6
Show file tree
Hide file tree
Showing 13 changed files with 81 additions and 150 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ on:
jobs:
build:

runs-on: ubuntu-latest
runs-on: ${{ matrix.os }}

strategy:
matrix:
python-version: ["3.8", "3.12"]
os: [ubuntu-latest, macos-latest, windows-latest]

steps:
- uses: actions/checkout@v3
Expand All @@ -25,9 +27,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install ruff pytest pytest-asyncio
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python -m pip install .[dev]
- name: Ruff
run: |
ruff format . --check --target-version py38
Expand Down
8 changes: 1 addition & 7 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# vqd_cache
/vqd_cache

# vscode
.vscode
#.idea/
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,21 +38,21 @@ python -m duckduckgo_search --help
CLI examples:
```python3
# text search
ddgs text -k 'ayrton senna'
ddgs text -k "ayrton senna"
# text search via proxy (example: Tor Browser)
ddgs text -k 'china is a global threat' -p socks5://localhost:9150
ddgs text -k "china is a global threat" -p socks5://localhost:9150
# find and download pdf files
ddgs text -k "russia filetype:pdf" -m 50 -d
# find in es-es region and download pdf files via proxy (example: Tor browser)
ddgs text -k "embajada a tamorlán filetype:pdf" -r es-es -m 50 -d -p socks5://localhost:9150
# find and download xls files from a specific site
ddgs text -k 'sanctions filetype:xls site:gov.ua' -m 50 -d
ddgs text -k "sanctions filetype:xls site:gov.ua" -m 50 -d
# find and download any doc(x) files from a specific site
ddgs text -k 'filetype:doc site:mos.ru' -m 50 -d
ddgs text -k "filetype:doc site:mos.ru" -m 50 -d
# find and download images
ddgs images -k "yuri kuklachev cat theatre" -m 500 -s off -d
# find in br-br region and download images via proxy (example: Tor browser) in 10 threads
ddgs images -k 'rio carnival' -r br-br -s off -m 500 -d -th 10 -p socks5://localhost:9150
ddgs images -k "rio carnival" -r br-br -s off -m 500 -d -th 10 -p socks5://localhost:9150
# get latest news
ddgs news -k "ukraine war" -s off -t d -m 10
# get last day's news and save it to a csv file
Expand Down
71 changes: 31 additions & 40 deletions duckduckgo_search/cli.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
import asyncio
import csv
import json
import logging
import os
import ssl
from concurrent.futures import as_completed, ThreadPoolExecutor
from datetime import datetime
from random import choice
from urllib.parse import unquote

import aiofiles
import click
import httpx
from curl_cffi import requests

from .duckduckgo_search import DDGS, USERAGENTS
from .version import __version__
Expand All @@ -36,7 +34,7 @@


def save_json(jsonfile, data):
with open(jsonfile, "w") as file:
with open(jsonfile, "w", encoding="utf-8") as file:
json.dump(data, file, ensure_ascii=False, indent=4)


Expand Down Expand Up @@ -80,47 +78,40 @@ def sanitize_keywords(keywords):
return keywords


async def download_file(url, dir_path, filename, sem, proxy):
def download_file(url, dir_path, filename, proxy):
headers = {"User-Agent": choice(USERAGENTS)}
for i in range(2):
try:
async with sem, httpx.AsyncClient(headers=headers, proxies=proxy, timeout=10) as client:
async with client.stream("GET", url) as resp:
resp.raise_for_status()
async with aiofiles.open(os.path.join(dir_path, filename[:200]), "wb") as file:
async for chunk in resp.aiter_bytes():
await file.write(chunk)
break
except (httpx.HTTPError, ssl.SSLCertVerificationError, ssl.SSLError) as ex:
logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")
except ValueError as ex:
raise ex


async def _download_results(keywords, results, images=False, proxy=None, threads=None):
try:
with requests.Session(headers=headers, proxies=proxy, impersonate="chrome110") as session:
resp = session.get(url, stream=True)
resp.raise_for_status()
with open(os.path.join(dir_path, filename[:200]), "wb") as file:
for chunk in resp.iter_content():
file.write(chunk)
except Exception as ex:
logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")


def download_results(keywords, results, images=False, proxy=None, threads=None):
path_type = "images" if images else "text"
path = f"{path_type}_{keywords}_{datetime.now():%Y%m%d_%H%M%S}"
os.makedirs(path, exist_ok=True)
proxy = {"http": proxy, "https": proxy}

threads = 10 if threads is None else threads
sem = asyncio.Semaphore(threads)
tasks = []
for i, res in enumerate(results, start=1):
url = res["image"] if images else res["href"]
filename = unquote(url.split("/")[-1].split("?")[0])
task = asyncio.create_task(download_file(url, path, f"{i}_{filename}", sem, proxy))
tasks.append(task)

with click.progressbar(length=len(tasks), label="Downloading", show_percent=True, show_pos=True, width=50) as bar:
for future in asyncio.as_completed(tasks):
await future
bar.update(1)

await asyncio.gather(*tasks)


def download_results(keywords, results, images=False, proxy=None, threads=None):
asyncio.run(_download_results(keywords, results, images, proxy))
with ThreadPoolExecutor(max_workers=threads) as executor:
futures = []
for i, res in enumerate(results, start=1):
url = res["image"] if images else res["href"]
filename = unquote(url.split("/")[-1].split("?")[0])
f = executor.submit(download_file, url, path, f"{i}_{filename}", proxy)
futures.append(f)

with click.progressbar(
length=len(futures), label="Downloading", show_percent=True, show_pos=True, width=50
) as bar:
for future in as_completed(futures):
future.result()
bar.update(1)


@click.group(chain=True)
Expand Down
35 changes: 14 additions & 21 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import logging
from collections import deque
from datetime import datetime, timezone
Expand All @@ -8,10 +7,10 @@
from time import sleep
from typing import Deque, Dict, Iterator, Optional, Set, Tuple

import httpx
from lxml import html
from curl_cffi import requests

from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException
from .exceptions import DuckDuckGoSearchException
from .models import MapsResult
from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json

Expand All @@ -31,31 +30,25 @@ def __init__(self, headers=None, proxies=None, timeout=10) -> None:
if headers is None:
headers = HEADERS
headers["User-Agent"] = choice(USERAGENTS)
self.proxies = proxies
self._client = httpx.Client(headers=headers, proxies=proxies, timeout=timeout, http2=True, verify=False)
self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
self._session = requests.Session(
headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110"
)

def __enter__(self) -> "DDGS":
return self

def __exit__(self, exc_type, exc_val, exc_tb) -> None:
self._client.close()
self._session.close()

def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]:
def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
try:
resp = self._client.request(method, url, follow_redirects=True, **kwargs)
if _is_500_in_url(str(resp.url)) or resp.status_code == 403:
raise APIException(f"_get_url() {url}")
if resp.status_code == 202:
raise RateLimitException(f"_get_url() {url}")
resp = self._session.request(method, url, **kwargs)
resp.raise_for_status()
if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
raise
if resp.status_code == 200:
return resp
resp.raise_for_status()
except httpx.TimeoutException as ex:
raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}")
except (APIException, RateLimitException):
raise
except httpx.HTTPError as ex:
raise HTTPException(f"_get_url() {url} HttpError: {ex}")
except Exception as ex:
raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}")

Expand Down Expand Up @@ -205,7 +198,7 @@ def _text_html(
"""
assert keywords, "keywords is mandatory"

self._client.headers["Referer"] = "https://html.duckduckgo.com/"
self._session.headers["Referer"] = "https://html.duckduckgo.com/"
safesearch_base = {"on": 1, "moderate": -1, "off": -2}
payload = {
"q": keywords,
Expand Down Expand Up @@ -857,7 +850,7 @@ def translate(
"POST",
"https://duckduckgo.com/translation.js",
params=payload,
content=keywords.encode(),
data=keywords.encode(),
)
if resp is None:
return None
Expand Down
34 changes: 14 additions & 20 deletions duckduckgo_search/duckduckgo_search_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from random import choice
from typing import AsyncIterator, Deque, Dict, Optional, Set, Tuple

import httpx
from lxml import html
from curl_cffi import requests

from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException
from .exceptions import DuckDuckGoSearchException
from .models import MapsResult
from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json

Expand All @@ -30,31 +30,25 @@ def __init__(self, headers=None, proxies=None, timeout=10) -> None:
if headers is None:
headers = HEADERS
headers["User-Agent"] = choice(USERAGENTS)
self.proxies = proxies
self._client = httpx.AsyncClient(headers=headers, proxies=proxies, timeout=timeout, http2=True, verify=False)
self.proxies = proxies if proxies and isinstance(proxies, dict) else {"http": proxies, "https": proxies}
self._session = requests.Session(
headers=headers, proxies=self.proxies, timeout=timeout, http_version=2, impersonate="chrome110"
)

async def __aenter__(self) -> "AsyncDDGS":
return self

async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
await self._client.aclose()
self._session.close()

async def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]:
async def _get_url(self, method: str, url: str, **kwargs) -> Optional[requests.Response]:
try:
resp = await self._client.request(method, url, follow_redirects=True, **kwargs)
if _is_500_in_url(str(resp.url)) or resp.status_code == 403:
raise APIException(f"_get_url() {url}")
if resp.status_code == 202:
raise RateLimitException(f"_get_url() {url}")
resp = self._session.request(method, url, **kwargs)
resp.raise_for_status()
if _is_500_in_url(str(resp.url)) or resp.status_code == 202:
raise
if resp.status_code == 200:
return resp
resp.raise_for_status()
except httpx.TimeoutException as ex:
raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}")
except (APIException, RateLimitException):
raise
except httpx.HTTPError as ex:
raise HTTPException(f"_get_url() {url} HttpError: {ex}")
except Exception as ex:
raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}")

Expand Down Expand Up @@ -205,7 +199,7 @@ async def _text_html(
"""
assert keywords, "keywords is mandatory"

self._client.headers["Referer"] = "https://html.duckduckgo.com/"
self._session.headers["Referer"] = "https://html.duckduckgo.com/"
safesearch_base = {"on": 1, "moderate": -1, "off": -2}
payload = {
"q": keywords,
Expand Down Expand Up @@ -856,7 +850,7 @@ async def translate(
"POST",
"https://duckduckgo.com/translation.js",
params=payload,
content=keywords.encode(),
data=keywords.encode(),
)
if resp is None:
return None
Expand Down
16 changes: 0 additions & 16 deletions duckduckgo_search/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,5 @@ class DuckDuckGoSearchException(Exception):
"""Base exception class for duckduckgo_search."""


class APIException(DuckDuckGoSearchException):
"""Exception raised for API errors."""


class HTTPException(DuckDuckGoSearchException):
"""Exception raised for HTTP errors."""


class RateLimitException(DuckDuckGoSearchException):
"""Exception raised for rate limit errors."""


class TimeoutException(DuckDuckGoSearchException):
"""Exception raised for timeout errors."""


class VQDExtractionException(DuckDuckGoSearchException):
"""Exception raised for error in extract vqd."""
2 changes: 1 addition & 1 deletion duckduckgo_search/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.9.11"
__version__ = "4.0.0"
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,9 @@ classifiers = [
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
"aiofiles>=23.2.1",
"click>=8.1.7",
"lxml>=4.9.3",
"httpx[http2,socks,brotli]>=0.25.1",
"curl_cffi>=0.5.10"
]
dynamic = ["version"]

Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
aiofiles>=23.2.1
click>=8.1.7
lxml>=4.9.3
httpx[http2,socks,brotli]>=0.25.1
curl_cffi>=0.5.10
8 changes: 0 additions & 8 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import os
import shutil
from time import sleep

import pytest
from click.testing import CliRunner

from duckduckgo_search import DDGS, __version__
Expand All @@ -11,12 +9,6 @@
runner = CliRunner()


@pytest.fixture(autouse=True)
def slow_down_tests():
yield
sleep(2)


def test_version_command():
result = runner.invoke(cli, ["version"])
assert result.output.strip() == __version__
Expand Down
Loading

0 comments on commit 8211fe6

Please sign in to comment.