From edbccedaa4c8b4fe3c421d20256e08f616a8878e Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sat, 25 Nov 2023 13:01:42 +0000 Subject: [PATCH] V3.9.7 (#137) 1) add core exceptions, 2) remove retries in _get_url(), 3) raise APIException if resp.status_code==403, 4) raise RateLimitException if resp.status_code==202, 5) sleep(0.75) between API requests if proxies is None, 6) sort imports, lint and format .py files with Ruff, 7) pytest: sleep(1) between tests. --- .github/workflows/python-package.yml | 14 +-- duckduckgo_search/duckduckgo_search.py | 89 ++++++++++---------- duckduckgo_search/duckduckgo_search_async.py | 78 ++++++++--------- duckduckgo_search/exceptions.py | 22 +++++ duckduckgo_search/utils.py | 12 ++- duckduckgo_search/version.py | 2 +- pyproject.toml | 13 +-- requirements-dev.txt | 4 +- tests/test_cli.py | 8 ++ tests/test_duckduckgo_search.py | 8 ++ tests/test_duckduckgo_search_async.py | 7 ++ 11 files changed, 143 insertions(+), 114 deletions(-) create mode 100644 duckduckgo_search/exceptions.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 04e7c1b..7d4f564 100755 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,8 +15,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.11", "3.12.0-rc.3"] - + python-version: ["3.8", "3.12"] + steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} @@ -26,17 +26,11 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install black isort ruff pytest pytest-asyncio + python -m pip install ruff pytest pytest-asyncio if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Ruff run: | - ruff check . 
--target-version=py38 - - name: Isort - run: | - isort duckduckgo_search --profile black --check - - name: Black - run: | - black --check --diff --target-version=py38 duckduckgo_search + ruff format . --check --target-version py38 - name: Pytest run: | pytest diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index 9d6f346..8fa2460 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -1,6 +1,6 @@ import logging from collections import deque -from datetime import datetime +from datetime import datetime, timezone from decimal import Decimal from itertools import cycle from random import choice @@ -10,8 +10,9 @@ import httpx from lxml import html +from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException from .models import MapsResult -from .utils import USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url +from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url logger = logging.getLogger(__name__) @@ -27,12 +28,9 @@ class DDGS: def __init__(self, headers=None, proxies=None, timeout=10) -> None: if headers is None: - headers = { - "User-Agent": choice(USERAGENTS), - "Accept": "application/json, text/javascript, */*; q=0.01", - "Accept-Language": "en-US,en;q=0.5", - "Referer": "https://duckduckgo.com/", - } + headers = HEADERS + headers["User-Agent"] = choice(USERAGENTS) + self.proxies = proxies self._client = httpx.Client(headers=headers, proxies=proxies, timeout=timeout, http2=True) def __enter__(self) -> "DDGS": @@ -42,27 +40,32 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: self._client.close() def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]: - for i in range(3): - try: - resp = self._client.request(method, url, follow_redirects=True, **kwargs) - if _is_500_in_url(str(resp.url)): - raise 
httpx._exceptions.HTTPError("") - resp.raise_for_status() - if resp.status_code == 202: - return 202 - if resp.status_code == 200: - return resp - except Exception as ex: - logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}") - if i >= 2 or "418" in str(ex): - raise ex - sleep(3) + try: + resp = self._client.request(method, url, follow_redirects=True, **kwargs) + if _is_500_in_url(str(resp.url)) or resp.status_code == 403: + raise APIException(f"_get_url() {url} 500 in url") + if resp.status_code == 202: + raise RateLimitException(f"_get_url() {url} RateLimitError: resp.status_code==202") + if resp.status_code == 200: + return resp + resp.raise_for_status() + except httpx.TimeoutException as ex: + raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}") + except httpx.HTTPError as ex: + raise HTTPException(f"_get_url() {url} HttpError: {ex}") + except Exception as ex: + raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}") def _get_vqd(self, keywords: str) -> Optional[str]: """Get vqd value for a search query.""" resp = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords}) if resp: - return _extract_vqd(resp.content) + return _extract_vqd(resp.content, keywords) + + def _sleep(self) -> None: + """Sleep between API requests if proxies is None.""" + if self.proxies is None: + sleep(0.75) def text( self, @@ -96,10 +99,11 @@ def text( elif backend == "lite": results = self._text_lite(keywords, region, timelimit, max_results) - for i, result in enumerate(results, start=1): - yield result - if max_results and i >= max_results: - break + if results: + for i, result in enumerate(results, start=1): + yield result + if max_results and i >= max_results: + break def _text_api( self, @@ -125,7 +129,6 @@ def _text_api( assert keywords, "keywords is mandatory" vqd = self._get_vqd(keywords) - assert vqd, "error in getting vqd" payload = { "q": keywords, @@ -151,9 +154,7 @@ def _text_api( resp = self._get_url("GET", 
"https://links.duckduckgo.com/d.js", params=payload) if resp is None: return - if resp == 202: - payload["s"] = f"{int(payload['s']) + 50}" - continue + try: page_data = resp.json().get("results", None) except Exception: @@ -179,6 +180,7 @@ def _text_api( if max_results is None or result_exists is False or next_page_url is None: return payload["s"] = next_page_url.split("s=")[1].split("&")[0] + self._sleep() def _text_html( self, @@ -216,9 +218,6 @@ def _text_html( resp = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload) if resp is None: return - if resp == 202: - payload["s"] = f"{int(payload['s']) + 50}" - continue tree = html.fromstring(resp.content) if tree.xpath('//div[@class="no-results"]/text()'): @@ -249,6 +248,7 @@ def _text_html( names = next_page.xpath('.//input[@type="hidden"]/@name') values = next_page.xpath('.//input[@type="hidden"]/@value') payload = {n: v for n, v in zip(names, values)} + self._sleep() def _text_lite( self, @@ -279,20 +279,17 @@ def _text_lite( "kl": region, "df": timelimit, } + cache: Set[str] = set() for _ in range(11): resp = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload) if resp is None: return - if resp == 202: - payload["s"] = f"{int(payload['s']) + 50}" - continue if b"No more results." 
in resp.content: return tree = html.fromstring(resp.content) - result_exists = False data = zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr")) for i, e in data: @@ -320,7 +317,8 @@ def _text_lite( if not next_page_s: return payload["s"] = next_page_s[0] - payload["vqd"] = _extract_vqd(resp.content) + payload["vqd"] = _extract_vqd(resp.content, keywords) + self._sleep() def images( self, @@ -361,7 +359,6 @@ def images( assert keywords, "keywords is mandatory" vqd = self._get_vqd(keywords) - assert vqd, "error in getting vqd" safesearch_base = {"on": 1, "moderate": 1, "off": -1} timelimit = f"time:{timelimit}" if timelimit else "" @@ -415,6 +412,7 @@ def images( if next is None: return payload["s"] = next.split("s=")[-1].split("&")[0] + self._sleep() def videos( self, @@ -446,7 +444,6 @@ def videos( assert keywords, "keywords is mandatory" vqd = self._get_vqd(keywords) - assert vqd, "error in getting vqd" safesearch_base = {"on": 1, "moderate": -1, "off": -2} timelimit = f"publishedAfter:{timelimit}" if timelimit else "" @@ -490,6 +487,7 @@ def videos( if next is None: return payload["s"] = next.split("s=")[-1].split("&")[0] + self._sleep() def news( self, @@ -515,7 +513,6 @@ def news( assert keywords, "keywords is mandatory" vqd = self._get_vqd(keywords) - assert vqd, "error in getting vqd" safesearch_base = {"on": 1, "moderate": -1, "off": -2} payload = { @@ -549,7 +546,7 @@ def news( image_url = row.get("image", None) result_exists = True yield { - "date": datetime.utcfromtimestamp(row["date"]).isoformat(), + "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(), "title": row["title"], "body": _normalize(row["excerpt"]), "url": _normalize_url(row["url"]), @@ -564,6 +561,7 @@ def news( if next is None: return payload["s"] = next.split("s=")[-1].split("&")[0] + self._sleep() def answers(self, keywords: str) -> Iterator[Dict[str, Optional[str]]]: """DuckDuckGo instant answers. 
Query params: https://duckduckgo.com/params @@ -701,7 +699,6 @@ def maps( assert keywords, "keywords is mandatory" vqd = self._get_vqd(keywords) - assert vqd, "error in getting vqd" # if longitude and latitude are specified, skip the request about bbox to the nominatim api if latitude and longitude: @@ -816,6 +813,7 @@ def maps( bbox3 = (lat_middle, lon_l, lat_b, lon_middle) bbox4 = (lat_middle, lon_middle, lat_b, lon_r) work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4]) + self._sleep() def translate( self, keywords: str, from_: Optional[str] = None, to: str = "en" @@ -834,7 +832,6 @@ def translate( assert keywords, "keywords is mandatory" vqd = self._get_vqd("translate") - assert vqd, "error in getting vqd" payload = { "vqd": vqd, diff --git a/duckduckgo_search/duckduckgo_search_async.py b/duckduckgo_search/duckduckgo_search_async.py index cf812ac..814ccae 100644 --- a/duckduckgo_search/duckduckgo_search_async.py +++ b/duckduckgo_search/duckduckgo_search_async.py @@ -1,7 +1,7 @@ import asyncio import logging from collections import deque -from datetime import datetime +from datetime import datetime, timezone from decimal import Decimal from itertools import cycle from random import choice @@ -10,8 +10,9 @@ import httpx from lxml import html +from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException from .models import MapsResult -from .utils import USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url +from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url logger = logging.getLogger(__name__) @@ -27,12 +28,9 @@ class AsyncDDGS: def __init__(self, headers=None, proxies=None, timeout=10) -> None: if headers is None: - headers = { - "User-Agent": choice(USERAGENTS), - "Accept": "application/json, text/javascript, */*; q=0.01", - "Accept-Language": "en-US,en;q=0.5", - "Referer": "https://duckduckgo.com/", - } + headers = HEADERS + headers["User-Agent"] 
= choice(USERAGENTS) + self.proxies = proxies self._client = httpx.AsyncClient(headers=headers, proxies=proxies, timeout=timeout, http2=True) async def __aenter__(self) -> "AsyncDDGS": @@ -42,27 +40,32 @@ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: await self._client.aclose() async def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]: - for i in range(3): - try: - resp = await self._client.request(method, url, follow_redirects=True, **kwargs) - if _is_500_in_url(str(resp.url)): - raise httpx._exceptions.HTTPError("") - resp.raise_for_status() - if resp.status_code == 202: - return 202 - if resp.status_code == 200: - return resp - except Exception as ex: - logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}") - if i >= 2 or "418" in str(ex): - raise ex - await asyncio.sleep(3) + try: + resp = await self._client.request(method, url, follow_redirects=True, **kwargs) + if _is_500_in_url(str(resp.url)) or resp.status_code == 403: + raise APIException(f"_get_url() {url} 500 in url") + if resp.status_code == 202: + raise RateLimitException(f"_get_url() {url} RateLimitError: resp.status_code==202") + if resp.status_code == 200: + return resp + resp.raise_for_status() + except httpx.TimeoutException as ex: + raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}") + except httpx.HTTPError as ex: + raise HTTPException(f"_get_url() {url} HttpError: {ex}") + except Exception as ex: + raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}") async def _get_vqd(self, keywords: str) -> Optional[str]: """Get vqd value for a search query.""" resp = await self._get_url("POST", "https://duckduckgo.com", data={"q": keywords}) if resp: - return _extract_vqd(resp.content) + return _extract_vqd(resp.content, keywords) + + async def _sleep(self) -> None: + """Sleep between API requests if proxies is None.""" + if self.proxies is None: + await asyncio.sleep(0.75) async def text( self, @@ -127,7 +130,6 @@ 
async def _text_api( assert keywords, "keywords is mandatory" vqd = await self._get_vqd(keywords) - assert vqd, "error in getting vqd" payload = { "q": keywords, @@ -153,9 +155,7 @@ async def _text_api( resp = await self._get_url("GET", "https://links.duckduckgo.com/d.js", params=payload) if resp is None: return - if resp == 202: - payload["s"] = f"{int(payload['s']) + 50}" - continue + try: page_data = resp.json().get("results", None) except Exception: @@ -181,6 +181,7 @@ async def _text_api( if max_results is None or result_exists is False or next_page_url is None: return payload["s"] = next_page_url.split("s=")[1].split("&")[0] + await self._sleep() async def _text_html( self, @@ -218,9 +219,6 @@ async def _text_html( resp = await self._get_url("POST", "https://html.duckduckgo.com/html", data=payload) if resp is None: return - if resp == 202: - payload["s"] = f"{int(payload['s']) + 50}" - continue tree = html.fromstring(resp.content) if tree.xpath('//div[@class="no-results"]/text()'): @@ -251,6 +249,7 @@ async def _text_html( names = next_page.xpath('.//input[@type="hidden"]/@name') values = next_page.xpath('.//input[@type="hidden"]/@value') payload = {n: v for n, v in zip(names, values)} + await self._sleep() async def _text_lite( self, @@ -286,9 +285,6 @@ async def _text_lite( resp = await self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload) if resp is None: return - if resp == 202: - payload["s"] = f"{int(payload['s']) + 50}" - continue if b"No more results." 
in resp.content: return @@ -322,7 +318,8 @@ async def _text_lite( if not next_page_s: return payload["s"] = next_page_s[0] - payload["vqd"] = _extract_vqd(resp.content) + payload["vqd"] = _extract_vqd(resp.content, keywords) + await self._sleep() async def images( self, @@ -363,7 +360,6 @@ async def images( assert keywords, "keywords is mandatory" vqd = await self._get_vqd(keywords) - assert vqd, "error in getting vqd" safesearch_base = {"on": 1, "moderate": 1, "off": -1} timelimit = f"time:{timelimit}" if timelimit else "" @@ -417,6 +413,7 @@ async def images( if next is None: return payload["s"] = next.split("s=")[-1].split("&")[0] + await self._sleep() async def videos( self, @@ -448,7 +445,6 @@ async def videos( assert keywords, "keywords is mandatory" vqd = await self._get_vqd(keywords) - assert vqd, "error in getting vqd" safesearch_base = {"on": 1, "moderate": -1, "off": -2} timelimit = f"publishedAfter:{timelimit}" if timelimit else "" @@ -492,6 +488,7 @@ async def videos( if next is None: return payload["s"] = next.split("s=")[-1].split("&")[0] + await self._sleep() async def news( self, @@ -517,7 +514,6 @@ async def news( assert keywords, "keywords is mandatory" vqd = await self._get_vqd(keywords) - assert vqd, "error in getting vqd" safesearch_base = {"on": 1, "moderate": -1, "off": -2} payload = { @@ -551,7 +547,7 @@ async def news( image_url = row.get("image", None) result_exists = True yield { - "date": datetime.utcfromtimestamp(row["date"]).isoformat(), + "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(), "title": row["title"], "body": _normalize(row["excerpt"]), "url": _normalize_url(row["url"]), @@ -566,6 +562,7 @@ async def news( if next is None: return payload["s"] = next.split("s=")[-1].split("&")[0] + await self._sleep() async def answers(self, keywords: str) -> AsyncIterator[Dict[str, Optional[str]]]: """DuckDuckGo instant answers. 
Query params: https://duckduckgo.com/params @@ -703,7 +700,6 @@ async def maps( assert keywords, "keywords is mandatory" vqd = await self._get_vqd(keywords) - assert vqd, "error in getting vqd" # if longitude and latitude are specified, skip the request about bbox to the nominatim api if latitude and longitude: @@ -818,6 +814,7 @@ async def maps( bbox3 = (lat_middle, lon_l, lat_b, lon_middle) bbox4 = (lat_middle, lon_middle, lat_b, lon_r) work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4]) + await self._sleep() async def translate( self, keywords: str, from_: Optional[str] = None, to: str = "en" @@ -836,7 +833,6 @@ async def translate( assert keywords, "keywords is mandatory" vqd = await self._get_vqd("translate") - assert vqd, "error in getting vqd" payload = { "vqd": vqd, diff --git a/duckduckgo_search/exceptions.py b/duckduckgo_search/exceptions.py new file mode 100644 index 0000000..2177210 --- /dev/null +++ b/duckduckgo_search/exceptions.py @@ -0,0 +1,22 @@ +class DuckDuckGoSearchException(Exception): + """Base exception class for duckduckgo_search.""" + + +class APIException(DuckDuckGoSearchException): + """Exception raised for API errors.""" + + +class HTTPException(DuckDuckGoSearchException): + """Exception raised for HTTP errors.""" + + +class RateLimitException(DuckDuckGoSearchException): + """Exception raised for rate limit errors.""" + + +class TimeoutException(DuckDuckGoSearchException): + """Exception raised for timeout errors.""" + + +class VQDExtractionException(DuckDuckGoSearchException): + """Exception raised for error in extract vqd.""" diff --git a/duckduckgo_search/utils.py b/duckduckgo_search/utils.py index 5053555..4fd90dd 100644 --- a/duckduckgo_search/utils.py +++ b/duckduckgo_search/utils.py @@ -3,9 +3,18 @@ from typing import Optional from urllib.parse import unquote +from .exceptions import VQDExtractionException + + REGEX_500_IN_URL = re.compile(r"(?:\d{3}-\d{2}\.js)") REGEX_STRIP_TAGS = re.compile("<.*?>") +HEADERS = { + "Accept": 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "Accept-Language": "en-US,en;q=0.9", + "Accept-Encoding": "gzip, deflate, br", + "Referer": "https://duckduckgo.com/", +} USERAGENTS = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", @@ -14,7 +23,7 @@ ] -def _extract_vqd(html_bytes: bytes) -> Optional[str]: +def _extract_vqd(html_bytes: bytes, keywords: str) -> Optional[str]: for c1, c2 in ( (b'vqd="', b'"'), (b"vqd=", b"&"), @@ -26,6 +35,7 @@ def _extract_vqd(html_bytes: bytes) -> Optional[str]: return html_bytes[start:end].decode() except ValueError: pass + raise VQDExtractionException(f"Could not extract vqd. {keywords=}") def _is_500_in_url(url: str) -> bool: diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py index 481f569..cdd6123 100755 --- a/duckduckgo_search/version.py +++ b/duckduckgo_search/version.py @@ -1 +1 @@ -__version__ = "3.9.6" +__version__ = "3.9.7" diff --git a/pyproject.toml b/pyproject.toml index 0f11e9b..7b9327e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,22 +46,11 @@ version = {attr = "duckduckgo_search.version.__version__"} [project.optional-dependencies] dev = [ - "black>=23.9.1", - "isort>=5.12.0", - "ruff>=0.0.291", + "ruff>=0.1.6", "pytest>=7.4.2", "pytest-asyncio>=0.21.1", ] -[tool.black] -line-length = 120 -target-version = ["py38"] - -[tool.isort] -atomic = true -profile = "black" -line_length = 120 - [tool.ruff] line-length = 120 target-version = "py38" diff --git a/requirements-dev.txt b/requirements-dev.txt index 5b5cbf0..264a242 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,3 @@ -black>=23.9.1 -isort>=5.12.0 -ruff>=0.0.291 +ruff>=0.1.6 pytest>=7.4.2 pytest-asyncio>=0.21.1 diff --git a/tests/test_cli.py 
b/tests/test_cli.py index 70304fd..fa4c8f2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,6 +1,8 @@ import os import shutil +from time import sleep +import pytest from click.testing import CliRunner from duckduckgo_search import DDGS, __version__ @@ -9,6 +11,12 @@ runner = CliRunner() +@pytest.fixture(autouse=True) +def slow_down_tests(): + yield + sleep(1) + + def test_version_command(): result = runner.invoke(cli, ["version"]) assert result.output.strip() == __version__ diff --git a/tests/test_duckduckgo_search.py b/tests/test_duckduckgo_search.py index 90a8562..4e6ebab 100644 --- a/tests/test_duckduckgo_search.py +++ b/tests/test_duckduckgo_search.py @@ -1,6 +1,14 @@ +from time import sleep +import pytest from duckduckgo_search import DDGS +@pytest.fixture(autouse=True) +def slow_down_tests(): + yield + sleep(1) + + def test_text(): with DDGS() as ddgs: results = [x for x in ddgs.text("cat", max_results=30)] diff --git a/tests/test_duckduckgo_search_async.py b/tests/test_duckduckgo_search_async.py index a92f0db..50681dc 100644 --- a/tests/test_duckduckgo_search_async.py +++ b/tests/test_duckduckgo_search_async.py @@ -1,8 +1,15 @@ +from time import sleep import pytest from duckduckgo_search import AsyncDDGS +@pytest.fixture(autouse=True) +def slow_down_tests(): + yield + sleep(1) + + @pytest.mark.asyncio async def test_text(): async with AsyncDDGS() as ddgs: