V3.9.7 (#137)

1) Add core exceptions; 2) remove retries in _get_url(); 3) raise APIException if resp.status_code == 403; 4) raise RateLimitException if resp.status_code == 202; 5) sleep(0.75) between API requests if proxies is None; 6) sort imports, lint, and format .py files with Ruff; 7) pytest: sleep(1) between tests.
deedy5 · Nov 25, 2023 · edbcced · edbcced
1 parent c6e70e1
commit edbcced
Show file tree

Hide file tree

Showing 11 changed files with 143 additions and 114 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -15,8 +15,8 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.11", "3.12.0-rc.3"]
-        
+        python-version: ["3.8", "3.12"]
+
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
@@ -26,17 +26,11 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install black isort ruff pytest pytest-asyncio
+        python -m pip install ruff pytest pytest-asyncio
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Ruff
       run: |
-        ruff check . --target-version=py38
-    - name: Isort
-      run: |
-        isort duckduckgo_search --profile black --check
-    - name: Black
-      run: |
-        black --check --diff --target-version=py38 duckduckgo_search
+        ruff format . --check --target-version py38
     - name: Pytest
       run: |
         pytest
diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
@@ -1,6 +1,6 @@
 import logging
 from collections import deque
-from datetime import datetime
+from datetime import datetime, timezone
 from decimal import Decimal
 from itertools import cycle
 from random import choice
@@ -10,8 +10,9 @@
 import httpx
 from lxml import html
 
+from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException
 from .models import MapsResult
-from .utils import USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url
+from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url
 
 logger = logging.getLogger(__name__)
 
@@ -27,12 +28,9 @@ class DDGS:
 
     def __init__(self, headers=None, proxies=None, timeout=10) -> None:
         if headers is None:
-            headers = {
-                "User-Agent": choice(USERAGENTS),
-                "Accept": "application/json, text/javascript, */*; q=0.01",
-                "Accept-Language": "en-US,en;q=0.5",
-                "Referer": "https://duckduckgo.com/",
-            }
+            headers = HEADERS
+            headers["User-Agent"] = choice(USERAGENTS)
+        self.proxies = proxies
         self._client = httpx.Client(headers=headers, proxies=proxies, timeout=timeout, http2=True)
 
     def __enter__(self) -> "DDGS":
@@ -42,27 +40,32 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
         self._client.close()
 
     def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]:
-        for i in range(3):
-            try:
-                resp = self._client.request(method, url, follow_redirects=True, **kwargs)
-                if _is_500_in_url(str(resp.url)):
-                    raise httpx._exceptions.HTTPError("")
-                resp.raise_for_status()
-                if resp.status_code == 202:
-                    return 202
-                if resp.status_code == 200:
-                    return resp
-            except Exception as ex:
-                logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}")
-                if i >= 2 or "418" in str(ex):
-                    raise ex
-            sleep(3)
+        try:
+            resp = self._client.request(method, url, follow_redirects=True, **kwargs)
+            if _is_500_in_url(str(resp.url)) or resp.status_code == 403:
+                raise APIException(f"_get_url() {url} 500 in url")
+            if resp.status_code == 202:
+                raise RateLimitException(f"_get_url() {url} RateLimitError: resp.status_code==202")
+            if resp.status_code == 200:
+                return resp
+            resp.raise_for_status()
+        except httpx.TimeoutException as ex:
+            raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}")
+        except httpx.HTTPError as ex:
+            raise HTTPException(f"_get_url() {url} HttpError: {ex}")
+        except Exception as ex:
+            raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}")
 
     def _get_vqd(self, keywords: str) -> Optional[str]:
         """Get vqd value for a search query."""
         resp = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
         if resp:
-            return _extract_vqd(resp.content)
+            return _extract_vqd(resp.content, keywords)
+
+    def _sleep(self) -> None:
+        """Sleep between API requests if proxies is None."""
+        if self.proxies is None:
+            sleep(0.75)
 
     def text(
         self,
@@ -96,10 +99,11 @@ def text(
         elif backend == "lite":
             results = self._text_lite(keywords, region, timelimit, max_results)
 
-        for i, result in enumerate(results, start=1):
-            yield result
-            if max_results and i >= max_results:
-                break
+        if results:
+            for i, result in enumerate(results, start=1):
+                yield result
+                if max_results and i >= max_results:
+                    break
 
     def _text_api(
         self,
@@ -125,7 +129,6 @@ def _text_api(
         assert keywords, "keywords is mandatory"
 
         vqd = self._get_vqd(keywords)
-        assert vqd, "error in getting vqd"
 
         payload = {
             "q": keywords,
@@ -151,9 +154,7 @@ def _text_api(
             resp = self._get_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
             if resp is None:
                 return
-            if resp == 202:
-                payload["s"] = f"{int(payload['s']) + 50}"
-                continue
+
             try:
                 page_data = resp.json().get("results", None)
             except Exception:
@@ -179,6 +180,7 @@ def _text_api(
             if max_results is None or result_exists is False or next_page_url is None:
                 return
             payload["s"] = next_page_url.split("s=")[1].split("&")[0]
+            self._sleep()
 
     def _text_html(
         self,
@@ -216,9 +218,6 @@ def _text_html(
             resp = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload)
             if resp is None:
                 return
-            if resp == 202:
-                payload["s"] = f"{int(payload['s']) + 50}"
-                continue
 
             tree = html.fromstring(resp.content)
             if tree.xpath('//div[@class="no-results"]/text()'):
@@ -249,6 +248,7 @@ def _text_html(
             names = next_page.xpath('.//input[@type="hidden"]/@name')
             values = next_page.xpath('.//input[@type="hidden"]/@value')
             payload = {n: v for n, v in zip(names, values)}
+            self._sleep()
 
     def _text_lite(
         self,
@@ -279,20 +279,17 @@ def _text_lite(
             "kl": region,
             "df": timelimit,
         }
+
         cache: Set[str] = set()
         for _ in range(11):
             resp = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload)
             if resp is None:
                 return
-            if resp == 202:
-                payload["s"] = f"{int(payload['s']) + 50}"
-                continue
 
             if b"No more results." in resp.content:
                 return
 
             tree = html.fromstring(resp.content)
-
             result_exists = False
             data = zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr"))
             for i, e in data:
@@ -320,7 +317,8 @@ def _text_lite(
             if not next_page_s:
                 return
             payload["s"] = next_page_s[0]
-            payload["vqd"] = _extract_vqd(resp.content)
+            payload["vqd"] = _extract_vqd(resp.content, keywords)
+            self._sleep()
 
     def images(
         self,
@@ -361,7 +359,6 @@ def images(
         assert keywords, "keywords is mandatory"
 
         vqd = self._get_vqd(keywords)
-        assert vqd, "error in getting vqd"
 
         safesearch_base = {"on": 1, "moderate": 1, "off": -1}
         timelimit = f"time:{timelimit}" if timelimit else ""
@@ -415,6 +412,7 @@ def images(
             if next is None:
                 return
             payload["s"] = next.split("s=")[-1].split("&")[0]
+            self._sleep()
 
     def videos(
         self,
@@ -446,7 +444,6 @@ def videos(
         assert keywords, "keywords is mandatory"
 
         vqd = self._get_vqd(keywords)
-        assert vqd, "error in getting vqd"
 
         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
@@ -490,6 +487,7 @@ def videos(
             if next is None:
                 return
             payload["s"] = next.split("s=")[-1].split("&")[0]
+            self._sleep()
 
     def news(
         self,
@@ -515,7 +513,6 @@ def news(
         assert keywords, "keywords is mandatory"
 
         vqd = self._get_vqd(keywords)
-        assert vqd, "error in getting vqd"
 
         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         payload = {
@@ -549,7 +546,7 @@ def news(
                     image_url = row.get("image", None)
                     result_exists = True
                     yield {
-                        "date": datetime.utcfromtimestamp(row["date"]).isoformat(),
+                        "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
                         "title": row["title"],
                         "body": _normalize(row["excerpt"]),
                         "url": _normalize_url(row["url"]),
@@ -564,6 +561,7 @@ def news(
             if next is None:
                 return
             payload["s"] = next.split("s=")[-1].split("&")[0]
+            self._sleep()
 
     def answers(self, keywords: str) -> Iterator[Dict[str, Optional[str]]]:
         """DuckDuckGo instant answers. Query params: https://duckduckgo.com/params
@@ -701,7 +699,6 @@ def maps(
         assert keywords, "keywords is mandatory"
 
         vqd = self._get_vqd(keywords)
-        assert vqd, "error in getting vqd"
 
         # if longitude and latitude are specified, skip the request about bbox to the nominatim api
         if latitude and longitude:
@@ -816,6 +813,7 @@ def maps(
                 bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
                 bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
                 work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
+            self._sleep()
 
     def translate(
         self, keywords: str, from_: Optional[str] = None, to: str = "en"
@@ -834,7 +832,6 @@ def translate(
         assert keywords, "keywords is mandatory"
 
         vqd = self._get_vqd("translate")
-        assert vqd, "error in getting vqd"
 
         payload = {
             "vqd": vqd,