[DDGS.text]: add backend="auto", [DDGS.images|news|videos]: remove multithreading #269

Merged 7 commits on Dec 22, 2024
8 changes: 3 additions & 5 deletions README.md
@@ -22,9 +22,6 @@ AI chat and search for text, news, images and videos using the DuckDuckGo.com se
 ```python
 pip install -U duckduckgo_search
 ```
-> [!NOTE]
-> you can install lxml to use the `text` function with `backend='html'` or `backend='lite'` (size ≈ 12Mb)</br>
-> `pip install -U duckduckgo_search[lxml]`
 
 ## CLI version
 
@@ -235,7 +232,7 @@ def text(
     region: str = "wt-wt",
     safesearch: str = "moderate",
     timelimit: str | None = None,
-    backend: str = "api",
+    backend: str = "auto",
     max_results: int | None = None,
 ) -> list[dict[str, str]]:
     """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params.
@@ -245,7 +242,8 @@ def text(
         region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
         safesearch: on, moderate, off. Defaults to "moderate".
         timelimit: d, w, m, y. Defaults to None.
-        backend: api, html, lite. Defaults to api.
+        backend: auto, api, html, lite. Defaults to auto.
+            auto - try all backends in random order,
             api - collect data from https://duckduckgo.com,
             html - collect data from https://html.duckduckgo.com,
             lite - collect data from https://lite.duckduckgo.com.
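For context, a minimal usage sketch of the new default (only the `DDGS.text` signature above is taken from the diff; the query and the printed result keys are illustrative):

```python
from duckduckgo_search import DDGS

# backend="auto" is now the default: the api, html and lite backends are
# tried in random order, and the first one that succeeds supplies the results.
with DDGS() as ddgs:
    results = ddgs.text("python web scraping", max_results=10, backend="auto")
    for r in results:
        print(r["title"], r["href"])
```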
2 changes: 1 addition & 1 deletion duckduckgo_search/cli.py
@@ -197,7 +197,7 @@ def chat(load, proxy, multiline, timeout, verify, model):
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
@click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
@click.option("-dd", "--download-directory", help="Specify custom download directory")
@click.option("-b", "--backend", default="api", type=click.Choice(["api", "html", "lite"]), help="which backend to use")
@click.option("-b", "--backend", default="auto", type=click.Choice(["auto", "api", "html", "lite"]))
@click.option("-th", "--threads", default=10, help="download threads, default=10")
@click.option("-p", "--proxy", help="the proxy to send requests, example: socks5://127.0.0.1:9150")
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
138 changes: 60 additions & 78 deletions duckduckgo_search/duckduckgo_search.py
@@ -3,25 +3,18 @@
 import logging
 import os
 import warnings
-from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone
 from functools import cached_property
-from itertools import cycle, islice
-from random import choice
+from itertools import cycle
+from random import choice, shuffle
 from time import sleep, time
 from types import TracebackType
 from typing import cast
 
 import primp  # type: ignore
 
-try:
-    from lxml.etree import _Element
-    from lxml.html import HTMLParser as LHTMLParser
-    from lxml.html import document_fromstring
-
-    LXML_AVAILABLE = True
-except ImportError:
-    LXML_AVAILABLE = False
+from lxml.etree import _Element
+from lxml.html import HTMLParser as LHTMLParser
+from lxml.html import document_fromstring
 
 from .exceptions import ConversationLimitException, DuckDuckGoSearchException, RatelimitException, TimeoutException
 from .utils import (
@@ -39,7 +32,6 @@
 class DDGS:
     """DuckDuckgo_search class to get search results from duckduckgo.com."""
 
-    _executor: ThreadPoolExecutor = ThreadPoolExecutor()
     _impersonates = (
         "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107", "chrome_108",
         "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118", "chrome_119", "chrome_120",
@@ -215,7 +207,7 @@ def text(
         region: str = "wt-wt",
         safesearch: str = "moderate",
         timelimit: str | None = None,
-        backend: str = "api",
+        backend: str = "auto",
         max_results: int | None = None,
     ) -> list[dict[str, str]]:
         """DuckDuckGo text search. Query params: https://duckduckgo.com/params.
@@ -225,7 +217,8 @@ def text(
             region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
             safesearch: on, moderate, off. Defaults to "moderate".
             timelimit: d, w, m, y. Defaults to None.
-            backend: api, html, lite. Defaults to api.
+            backend: auto, api, html, lite. Defaults to auto.
+                auto - try all backends in random order,
                 api - collect data from https://duckduckgo.com,
                 html - collect data from https://html.duckduckgo.com,
                 lite - collect data from https://lite.duckduckgo.com.
@@ -239,17 +232,25 @@ def text(
             RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
             TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
         """
-        if LXML_AVAILABLE is False and backend != "api":
-            backend = "api"
-            warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)
 
-        if backend == "api":
-            results = self._text_api(keywords, region, safesearch, timelimit, max_results)
-        elif backend == "html":
-            results = self._text_html(keywords, region, timelimit, max_results)
-        elif backend == "lite":
-            results = self._text_lite(keywords, region, timelimit, max_results)
-        return results
+        backends = ["api", "html", "lite"] if backend == "auto" else [backend]
+        shuffle(backends)
+
+        results, err = [], None
+        for b in backends:
+            try:
+                if b == "api":
+                    results = self._text_api(keywords, region, safesearch, timelimit, max_results)
+                elif b == "html":
+                    results = self._text_html(keywords, region, timelimit, max_results)
+                elif b == "lite":
+                    results = self._text_lite(keywords, region, timelimit, max_results)
+                return results
+            except Exception as ex:
+                logger.info(f"Error to search using {b} backend: {ex}")
+                err = ex
+
+        raise DuckDuckGoSearchException(err)
 
     def _text_api(
         self,
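Since the loop above re-raises only after every backend has failed, wrapping the last error in `DuckDuckGoSearchException`, callers can treat that single exception as "all backends exhausted". A minimal sketch (the query is illustrative; the import paths match the package layout shown in this diff):

```python
from duckduckgo_search import DDGS
from duckduckgo_search.exceptions import DuckDuckGoSearchException

try:
    results = DDGS().text("rust async runtimes", max_results=5)
except DuckDuckGoSearchException as ex:
    # Reached only after api, html and lite (tried in random order) have
    # all failed; ex wraps the last underlying error.
    print(f"All backends failed: {ex}")
```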
@@ -298,7 +299,7 @@ def _text_api(
                         return results
                 else:
                     next_page_url = row.get("n")
-                    if not next_page_url:
+                    if not next_page_url or not max_results:
                         return results
                     payload["s"] = next_page_url.split("s=")[1].split("&")[0]
         return results
@@ -364,7 +365,7 @@ def _text_html(
                         return results
 
             npx = tree.xpath('.//div[@class="nav-link"]')
-            if not npx:
+            if not npx or not max_results:
                 return results
             next_page = npx[-1] if isinstance(npx, list) else None
             if isinstance(next_page, _Element):
@@ -446,7 +447,7 @@ def _text_lite(
                             return results
 
             next_page_s = tree.xpath("//form[./input[contains(@value, 'ext')]]/input[@name='s']/@value")
-            if not next_page_s:
+            if not next_page_s or not max_results:
                 return results
             elif isinstance(next_page_s, list):
                 payload["s"] = str(next_page_s[0])
@@ -516,13 +517,11 @@ def images(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _images_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(5):
             resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload)
             resp_json = json_loads(resp_content)
-
             page_data = resp_json.get("results", [])
-            page_results = []
+
             for row in page_data:
                 image_url = row.get("image")
                 if image_url and image_url not in cache:
@@ -536,20 +535,15 @@ def _images_page(s: int) -> list[dict[str, str]]:
"width": row["width"],
"source": row["source"],
}
page_results.append(result)
return page_results

slist = [0]
if max_results:
max_results = min(max_results, 500)
slist.extend(range(100, max_results, 100))
try:
for r in self._executor.map(_images_page, slist):
results.extend(r)
except Exception as e:
raise e
results.append(result)
if max_results and len(results) >= max_results:
return results
next = resp_json.get("next")
if next is None or not max_results:
return results
payload["s"] = next.split("s=")[-1].split("&")[0]

return list(islice(results, max_results))
return results

def videos(
self,
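The same rewrite, dropping the `ThreadPoolExecutor` page fan-out in favor of a sequential walk of the server-supplied `next` cursor, is applied to `videos()` and `news()` below. Stripped of the DuckDuckGo specifics, the loop shape is roughly the following sketch, where `fetch_page` is a hypothetical stand-in for `self._get_url` plus `json_loads`:

```python
from typing import Callable

def paginate(
    fetch_page: Callable[[dict], dict],
    payload: dict,
    max_pages: int = 5,
    max_results: int | None = None,
) -> list[dict]:
    """Follow a server-side 'next' cursor instead of precomputing page offsets."""
    results: list[dict] = []
    for _ in range(max_pages):
        page = fetch_page(payload)  # one sequential HTTP round-trip per page
        results.extend(page.get("results", []))
        if max_results and len(results) >= max_results:
            return results[:max_results]
        next_cursor = page.get("next")
        # Stop when the server reports no further pages, or when no
        # max_results was requested (one page suffices), as in the PR.
        if next_cursor is None or not max_results:
            return results
        payload["s"] = next_cursor.split("s=")[-1].split("&")[0]
    return results
```

This trades the old threaded guesswork (precomputed offsets like 0, 100, 200, capped at a hard maximum) for letting the server signal when results are exhausted.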
@@ -603,30 +597,23 @@ def videos(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _videos_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(8):
             resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload)
             resp_json = json_loads(resp_content)
-
             page_data = resp_json.get("results", [])
-            page_results = []
+
             for row in page_data:
                 if row["content"] not in cache:
                     cache.add(row["content"])
-                    page_results.append(row)
-            return page_results
-
-        slist = [0]
-        if max_results:
-            max_results = min(max_results, 200)
-            slist.extend(range(60, max_results, 60))
-        try:
-            for r in self._executor.map(_videos_page, slist):
-                results.extend(r)
-        except Exception as e:
-            raise e
+                    results.append(row)
+                    if max_results and len(results) >= max_results:
+                        return results
+            next = resp_json.get("next")
+            if next is None or not max_results:
+                return results
+            payload["s"] = next.split("s=")[-1].split("&")[0]
 
-        return list(islice(results, max_results))
+        return results
 
     def news(
         self,
@@ -672,12 +659,11 @@ def news(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _news_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(5):
             resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload)
             resp_json = json_loads(resp_content)
             page_data = resp_json.get("results", [])
-            page_results = []
 
             for row in page_data:
                 if row["url"] not in cache:
                     cache.add(row["url"])
@@ -690,17 +676,13 @@ def _news_page(s: int) -> list[dict[str, str]]:
"image": _normalize_url(image_url),
"source": row["source"],
}
page_results.append(result)
return page_results
results.append(result)
if max_results and len(results) >= max_results:
return results

slist = [0]
if max_results:
max_results = min(max_results, 120)
slist.extend(range(30, max_results, 30))
try:
for r in self._executor.map(_news_page, slist):
results.extend(r)
except Exception as e:
raise e
next = resp_json.get("next")
if next is None or not max_results:
return results
payload["s"] = next.split("s=")[-1].split("&")[0]

return list(islice(results, max_results))
return results
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -31,6 +31,7 @@ classifiers = [
 dependencies = [
     "click>=8.1.7",
     "primp>=0.9.1",
+    "lxml>=5.3.0",
 ]
 dynamic = ["version"]
 
@@ -44,12 +45,10 @@ ddgs = "duckduckgo_search.cli:safe_entry_point"
 version = {attr = "duckduckgo_search.version.__version__"}
 
 [project.optional-dependencies]
-lxml = [
-    "lxml>=5.3.0",
-]
 dev = [
     "mypy>=1.13.0",
     "pytest>=8.3.4",
     "pytest-dependency>=0.6.0",
     "ruff>=0.8.3",
 ]
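With lxml promoted to a hard dependency here, a plain `pip install -U duckduckgo_search` now installs it automatically, which is why the `[lxml]` extra above and the corresponding README note could be dropped.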
