V3.9.7 #137

Merged · 4 commits · Nov 25, 2023
14 changes: 4 additions & 10 deletions .github/workflows/python-package.yml
@@ -15,8 +15,8 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.11", "3.12.0-rc.3"]
python-version: ["3.8", "3.12"]

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
@@ -26,17 +26,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install black isort ruff pytest pytest-asyncio
python -m pip install ruff pytest pytest-asyncio
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Ruff
run: |
ruff check . --target-version=py38
- name: Isort
run: |
isort duckduckgo_search --profile black --check
- name: Black
run: |
black --check --diff --target-version=py38 duckduckgo_search
ruff format . --check --target-version py38
- name: Pytest
run: |
pytest
89 changes: 43 additions & 46 deletions duckduckgo_search/duckduckgo_search.py
@@ -1,6 +1,6 @@
import logging
from collections import deque
from datetime import datetime
from datetime import datetime, timezone
from decimal import Decimal
from itertools import cycle
from random import choice
@@ -10,8 +10,9 @@
import httpx
from lxml import html

from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException
from .models import MapsResult
from .utils import USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url
from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url

logger = logging.getLogger(__name__)

@@ -27,12 +28,9 @@ class DDGS:

def __init__(self, headers=None, proxies=None, timeout=10) -> None:
if headers is None:
headers = {
"User-Agent": choice(USERAGENTS),
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://duckduckgo.com/",
}
headers = HEADERS
headers["User-Agent"] = choice(USERAGENTS)
self.proxies = proxies
self._client = httpx.Client(headers=headers, proxies=proxies, timeout=timeout, http2=True)

def __enter__(self) -> "DDGS":
@@ -42,27 +40,32 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
self._client.close()

def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Response]:
for i in range(3):
try:
resp = self._client.request(method, url, follow_redirects=True, **kwargs)
if _is_500_in_url(str(resp.url)):
raise httpx._exceptions.HTTPError("")
resp.raise_for_status()
if resp.status_code == 202:
return 202
if resp.status_code == 200:
return resp
except Exception as ex:
logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}")
if i >= 2 or "418" in str(ex):
raise ex
sleep(3)
try:
resp = self._client.request(method, url, follow_redirects=True, **kwargs)
if _is_500_in_url(str(resp.url)) or resp.status_code == 403:
raise APIException(f"_get_url() {url} 500 in url")
if resp.status_code == 202:
raise RateLimitException(f"_get_url() {url} RateLimitError: resp.status_code==202")
if resp.status_code == 200:
return resp
resp.raise_for_status()
except httpx.TimeoutException as ex:
raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}")
except httpx.HTTPError as ex:
raise HTTPException(f"_get_url() {url} HttpError: {ex}")
except Exception as ex:
raise DuckDuckGoSearchException(f"_get_url() {url} {type(ex).__name__}: {ex}")

def _get_vqd(self, keywords: str) -> Optional[str]:
"""Get vqd value for a search query."""
resp = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
if resp:
return _extract_vqd(resp.content)
return _extract_vqd(resp.content, keywords)

def _sleep(self) -> None:
"""Sleep between API requests if proxies is None."""
if self.proxies is None:
sleep(0.75)

def text(
self,
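
Note on the reworked error handling above: the internal retry loop and the 202/None sentinel returns are gone, so bad responses now surface as typed exceptions imported from .exceptions, and the new _sleep() throttles consecutive paginated requests when no proxies are set. Below is a minimal caller-side sketch, not part of this diff; the retry policy is hypothetical, and only the exception names shown in the import line above are assumed.

from time import sleep

from duckduckgo_search import DDGS
from duckduckgo_search.exceptions import RateLimitException, TimeoutException

def text_with_retry(keywords, attempts=3):
    # Hypothetical wrapper: with the built-in retry loop removed,
    # the caller decides how to react to the new typed exceptions.
    with DDGS() as ddgs:
        for _ in range(attempts):
            try:
                return list(ddgs.text(keywords, max_results=10))
            except RateLimitException:
                sleep(3)  # the former "202" case
            except TimeoutException:
                sleep(1)  # transient network problem
    return []
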
@@ -96,10 +99,11 @@ def text(
elif backend == "lite":
results = self._text_lite(keywords, region, timelimit, max_results)

for i, result in enumerate(results, start=1):
yield result
if max_results and i >= max_results:
break
if results:
for i, result in enumerate(results, start=1):
yield result
if max_results and i >= max_results:
break

def _text_api(
self,
@@ -125,7 +129,6 @@ def _text_api(
assert keywords, "keywords is mandatory"

vqd = self._get_vqd(keywords)
assert vqd, "error in getting vqd"

payload = {
"q": keywords,
@@ -151,9 +154,7 @@ def _text_api(
resp = self._get_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
if resp is None:
return
if resp == 202:
payload["s"] = f"{int(payload['s']) + 50}"
continue

try:
page_data = resp.json().get("results", None)
except Exception:
@@ -179,6 +180,7 @@ def _text_api(
if max_results is None or result_exists is False or next_page_url is None:
return
payload["s"] = next_page_url.split("s=")[1].split("&")[0]
self._sleep()

def _text_html(
self,
@@ -216,9 +218,6 @@ def _text_html(
resp = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload)
if resp is None:
return
if resp == 202:
payload["s"] = f"{int(payload['s']) + 50}"
continue

tree = html.fromstring(resp.content)
if tree.xpath('//div[@class="no-results"]/text()'):
@@ -249,6 +248,7 @@ def _text_html(
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
self._sleep()

def _text_lite(
self,
@@ -279,20 +279,17 @@ def _text_lite(
"kl": region,
"df": timelimit,
}

cache: Set[str] = set()
for _ in range(11):
resp = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload)
if resp is None:
return
if resp == 202:
payload["s"] = f"{int(payload['s']) + 50}"
continue

if b"No more results." in resp.content:
return

tree = html.fromstring(resp.content)

result_exists = False
data = zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr"))
for i, e in data:
@@ -320,7 +317,8 @@ def _text_lite(
if not next_page_s:
return
payload["s"] = next_page_s[0]
payload["vqd"] = _extract_vqd(resp.content)
payload["vqd"] = _extract_vqd(resp.content, keywords)
self._sleep()

def images(
self,
@@ -361,7 +359,6 @@ def images(
assert keywords, "keywords is mandatory"

vqd = self._get_vqd(keywords)
assert vqd, "error in getting vqd"

safesearch_base = {"on": 1, "moderate": 1, "off": -1}
timelimit = f"time:{timelimit}" if timelimit else ""
@@ -415,6 +412,7 @@ def images(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def videos(
self,
@@ -446,7 +444,6 @@ def videos(
assert keywords, "keywords is mandatory"

vqd = self._get_vqd(keywords)
assert vqd, "error in getting vqd"

safesearch_base = {"on": 1, "moderate": -1, "off": -2}
timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
@@ -490,6 +487,7 @@ def videos(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def news(
self,
@@ -515,7 +513,6 @@ def news(
assert keywords, "keywords is mandatory"

vqd = self._get_vqd(keywords)
assert vqd, "error in getting vqd"

safesearch_base = {"on": 1, "moderate": -1, "off": -2}
payload = {
@@ -549,7 +546,7 @@ def news(
image_url = row.get("image", None)
result_exists = True
yield {
"date": datetime.utcfromtimestamp(row["date"]).isoformat(),
"date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
"title": row["title"],
"body": _normalize(row["excerpt"]),
"url": _normalize_url(row["url"]),
@@ -564,6 +561,7 @@ def news(
if next is None:
return
payload["s"] = next.split("s=")[-1].split("&")[0]
self._sleep()

def answers(self, keywords: str) -> Iterator[Dict[str, Optional[str]]]:
"""DuckDuckGo instant answers. Query params: https://duckduckgo.com/params
@@ -701,7 +699,6 @@ def maps(
assert keywords, "keywords is mandatory"

vqd = self._get_vqd(keywords)
assert vqd, "error in getting vqd"

# if longitude and latitude are specified, skip the request about bbox to the nominatim api
if latitude and longitude:
@@ -816,6 +813,7 @@ def maps(
bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
self._sleep()

def translate(
self, keywords: str, from_: Optional[str] = None, to: str = "en"
@@ -834,7 +832,6 @@ def translate(
assert keywords, "keywords is mandatory"

vqd = self._get_vqd("translate")
assert vqd, "error in getting vqd"

payload = {
"vqd": vqd,
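
Summary sketch of the constructor change: DDGS now starts from the shared HEADERS constant in utils and only randomizes the User-Agent, while explicitly passed headers still replace the defaults entirely, as before. A short usage example, not part of this diff; the header values below are placeholders, not the library's actual defaults.

from duckduckgo_search import DDGS

# Default client: HEADERS from utils plus a randomly chosen User-Agent.
with DDGS(timeout=10) as ddgs:
    first = next(ddgs.text("duckduckgo_search"), None)

# Custom headers replace the defaults wholesale.
custom = {"User-Agent": "my-app/1.0", "Referer": "https://duckduckgo.com/"}
with DDGS(headers=custom, proxies=None, timeout=20) as ddgs:
    results = [r for r in ddgs.text("python", max_results=5)]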