Skip to content

Commit

Permalink
DDGS.text: deprecate backend='api'
Browse files Browse the repository at this point in the history
  • Loading branch information
deedy5 committed Dec 26, 2024
1 parent 630d598 commit 3ee8e08
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 80 deletions.
64 changes: 5 additions & 59 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
_extract_vqd,
_normalize,
_normalize_url,
_text_extract_json,
json_loads,
)

Expand Down Expand Up @@ -221,7 +220,6 @@ def text(
timelimit: d, w, m, y. Defaults to None.
backend: auto, html, lite, ecosia ('api' is deprecated and handled as auto). Defaults to auto.
auto - try all backends in random order,
api - collect data from https://duckduckgo.com,
html - collect data from https://html.duckduckgo.com,
lite - collect data from https://lite.duckduckgo.com,
ecosia - collect data from https://www.ecosia.com.
Expand All @@ -235,16 +233,16 @@ def text(
RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
"""

backends = ["api", "html", "lite", "ecosia"] if backend == "auto" else [backend]
if backend == "api":
warnings.warn("'api' backend is deprecated, using backend='auto'", stacklevel=2)
backend = "auto"
backends = ["html", "lite", "ecosia"] if backend == "auto" else [backend]
shuffle(backends)

results, err = [], None
for b in backends:
try:
if b == "api":
results = self._text_api(keywords, region, safesearch, timelimit, max_results)
elif b == "html":
if b == "html":
results = self._text_html(keywords, region, timelimit, max_results)
elif b == "lite":
results = self._text_lite(keywords, region, timelimit, max_results)
Expand All @@ -257,58 +255,6 @@ def text(

raise DuckDuckGoSearchException(err)

def _text_api(
self,
keywords: str,
region: str = "wt-wt",
safesearch: str = "moderate",
timelimit: str | None = None,
max_results: int | None = None,
) -> list[dict[str, str]]:
# Collect text search results by querying https://links.duckduckgo.com/d.js,
# issuing at most three requests and advancing the "s" offset between them.
#
# Returns a list of dicts with the keys "title", "href" and "body".
#
# NOTE: `assert` statements are removed when Python runs with -O, so this
# check does not guard against empty keywords in optimized mode.
assert keywords, "keywords is mandatory"

# The vqd value is sent along with every request in the payload below.
vqd = self._get_vqd(keywords)

payload = {
"q": keywords,
"kl": region,
"l": region,
# "p" is "1" only for safesearch == "on"; any other value sends "".
"p": "1" if safesearch == "on" else "",
# Offset of the first result; overwritten below when a next page is followed.
"s": "0",
"df": timelimit or "",
"vqd": vqd,
# Built from the region string by swapping its halves, e.g. "wt-wt" -> "wt-WT".
"bing_market": f"{region[3:]}-{region[:2].upper()}",
# "ex" is "-1" for moderate, "-2" for off, and "" for anything else.
"ex": "-1" if safesearch == "moderate" else "-2" if safesearch == "off" else "",
}

# hrefs already emitted, used to drop duplicate rows.
cache = set()
results: list[dict[str, str]] = []

# At most three requests are made.
for _ in range(3):
resp_content = self._get_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
page_data = _text_extract_json(resp_content, keywords)
for row in page_data:
href = row.get("u")
# Skip rows without a URL, rows already seen, and the Google-search link for the same keywords.
if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
cache.add(href)
body = _normalize(row["a"])
# Rows whose normalized body is empty are not added to the results.
if body:
results.append(
{
"title": _normalize(row["t"]),
"href": _normalize_url(href),
"body": body,
}
)
# Stop as soon as the requested number of results has been collected.
if max_results and len(results) >= max_results:
return results
# NOTE(review): indentation is lost in this view, so it is unclear whether this
# `else` pairs with `for row`, `if href` or `if body` - confirm against the file.
else:
# The "n" entry of the row holds the next-page URL, if there is one.
next_page_url = row.get("n")
# Without a next page, or without a max_results limit, return what was collected.
if not next_page_url or not max_results:
return results
# Take the value of the "s" query parameter as the next offset.
# Raises IndexError if the URL contains no "s=".
payload["s"] = next_page_url.split("s=")[1].split("&")[0]
return results

def _text_html(
self,
keywords: str,
Expand Down
13 changes: 0 additions & 13 deletions duckduckgo_search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,6 @@ def _extract_vqd(html_bytes: bytes, keywords: str) -> str:
raise DuckDuckGoSearchException(f"_extract_vqd() {keywords=} Could not extract vqd.")


def _text_extract_json(html_bytes: bytes, keywords: str) -> list[dict[str, str]]:
    """text(backend="api") -> extract json from html.

    Finds the ``DDG.pageLayout.load('d',`` marker in *html_bytes*, takes the
    bytes between it and the following ``);DDG.`` and decodes them with
    ``json_loads``.

    Args:
        html_bytes: Raw response body to search.
        keywords: Search keywords; only used in the error message.

    Returns:
        The decoded value returned by ``json_loads``.

    Raises:
        DuckDuckGoSearchException: If either marker is missing or decoding
            fails; the original exception is chained as the cause.
    """
    marker = b"DDG.pageLayout.load('d',"
    try:
        # Start right after the marker instead of relying on a hard-coded offset.
        start = html_bytes.index(marker) + len(marker)
        end = html_bytes.index(b");DDG.", start)
        result: list[dict[str, str]] = json_loads(html_bytes[start:end])
    except Exception as ex:
        raise DuckDuckGoSearchException(f"_text_extract_json() {keywords=} {type(ex).__name__}: {ex}") from ex
    return result


def _normalize(raw_html: str) -> str:
"""Strip HTML tags from the raw_html string."""
return unescape(REGEX_STRIP_TAGS.sub("", raw_html)) if raw_html else ""
Expand Down
12 changes: 4 additions & 8 deletions tests/test_duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,21 @@ def test_chat(model):
assert len(results) >= 1


def test_text():
    """A default-backend text search for "cat" returns between 15 and 20 results."""
    hits = DDGS().text("cat", safesearch="off", timelimit="m", max_results=20)
    count = len(hits)
    assert 15 <= count <= 20


def test_text_html():
    """The html backend returns between 15 and 20 results for a region/timelimit query."""
    # Only one search is issued: an earlier assignment to `results` was
    # overwritten immediately and just cost an extra network request.
    results = DDGS().text("eagle", backend="html", region="br-pt", timelimit="y", max_results=20)
    assert 15 <= len(results) <= 20


def test_text_lite():
    """The lite backend returns between 15 and 20 results for a region/timelimit query."""
    # Only one search is issued: an earlier assignment to `results` was
    # overwritten immediately and just cost an extra network request.
    results = DDGS().text("dog", backend="lite", region="br-pt", timelimit="y", max_results=20)
    assert 15 <= len(results) <= 20


def test_text_ecosia():
    """The ecosia backend returns between 15 and 20 results with safesearch off."""
    # Only one search is issued: an earlier assignment to `results` was
    # overwritten immediately and just cost an extra network request.
    results = DDGS().text("cat", backend="ecosia", region="br-pt", safesearch="off", max_results=20)
    assert 15 <= len(results) <= 20


def test_images():
results = DDGS().images("flower", max_results=200)
assert 85 <= len(results) <= 200
Expand Down

0 comments on commit 3ee8e08

Please sign in to comment.