[DDGS.text]: add backend="auto", [DDGS.images|news|videos]: remove multithreading #269

Merged 7 commits on Dec 22, 2024
8 changes: 3 additions & 5 deletions README.md
@@ -22,9 +22,6 @@ AI chat and search for text, news, images and videos using the DuckDuckGo.com se
 ```python
 pip install -U duckduckgo_search
 ```
-> [!NOTE]
-> you can install lxml to use the `text` function with `backend='html'` or `backend='lite'` (size ≈ 12Mb)</br>
-> `pip install -U duckduckgo_search[lxml]`
 
 ## CLI version
 
@@ -235,7 +232,7 @@ def text(
     region: str = "wt-wt",
     safesearch: str = "moderate",
     timelimit: str | None = None,
-    backend: str = "api",
+    backend: str = "auto",
     max_results: int | None = None,
 ) -> list[dict[str, str]]:
     """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params.
@@ -245,7 +242,8 @@ def text(
         region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
         safesearch: on, moderate, off. Defaults to "moderate".
         timelimit: d, w, m, y. Defaults to None.
-        backend: api, html, lite. Defaults to api.
+        backend: auto, api, html, lite. Defaults to auto.
+            auto - try all backends in random order,
             api - collect data from https://duckduckgo.com,
             html - collect data from https://html.duckduckgo.com,
             lite - collect data from https://lite.duckduckgo.com.
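For context, a minimal usage sketch of the new default (only the `DDGS.text` signature above is taken from the diff; the query and the printed result keys are illustrative):

```python
from duckduckgo_search import DDGS

# backend="auto" is now the default: the api, html and lite backends are
# tried in random order, and the first one that succeeds supplies the results.
with DDGS() as ddgs:
    results = ddgs.text("python web scraping", max_results=10, backend="auto")
    for r in results:
        print(r["title"], r["href"])
```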
2 changes: 1 addition & 1 deletion duckduckgo_search/cli.py
@@ -197,7 +197,7 @@ def chat(load, proxy, multiline, timeout, verify, model):
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
@click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
@click.option("-dd", "--download-directory", help="Specify custom download directory")
@click.option("-b", "--backend", default="api", type=click.Choice(["api", "html", "lite"]), help="which backend to use")
@click.option("-b", "--backend", default="auto", type=click.Choice(["auto", "api", "html", "lite"]))
@click.option("-th", "--threads", default=10, help="download threads, default=10")
@click.option("-p", "--proxy", help="the proxy to send requests, example: socks5://127.0.0.1:9150")
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
138 changes: 60 additions & 78 deletions duckduckgo_search/duckduckgo_search.py
@@ -3,25 +3,18 @@
 import logging
 import os
 import warnings
-from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone
 from functools import cached_property
-from itertools import cycle, islice
-from random import choice
+from itertools import cycle
+from random import choice, shuffle
 from time import sleep, time
 from types import TracebackType
 from typing import cast
 
 import primp  # type: ignore
 
-try:
-    from lxml.etree import _Element
-    from lxml.html import HTMLParser as LHTMLParser
-    from lxml.html import document_fromstring
-
-    LXML_AVAILABLE = True
-except ImportError:
-    LXML_AVAILABLE = False
+from lxml.etree import _Element
+from lxml.html import HTMLParser as LHTMLParser
+from lxml.html import document_fromstring
 
 from .exceptions import ConversationLimitException, DuckDuckGoSearchException, RatelimitException, TimeoutException
 from .utils import (
@@ -39,7 +32,6 @@
 class DDGS:
     """DuckDuckgo_search class to get search results from duckduckgo.com."""
 
-    _executor: ThreadPoolExecutor = ThreadPoolExecutor()
     _impersonates = (
         "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107", "chrome_108",
         "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118", "chrome_119", "chrome_120",
@@ -215,7 +207,7 @@ def text(
         region: str = "wt-wt",
         safesearch: str = "moderate",
         timelimit: str | None = None,
-        backend: str = "api",
+        backend: str = "auto",
         max_results: int | None = None,
     ) -> list[dict[str, str]]:
         """DuckDuckGo text search. Query params: https://duckduckgo.com/params.
@@ -225,7 +217,8 @@ def text(
             region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
             safesearch: on, moderate, off. Defaults to "moderate".
             timelimit: d, w, m, y. Defaults to None.
-            backend: api, html, lite. Defaults to api.
+            backend: auto, api, html, lite. Defaults to auto.
+                auto - try all backends in random order,
                 api - collect data from https://duckduckgo.com,
                 html - collect data from https://html.duckduckgo.com,
                 lite - collect data from https://lite.duckduckgo.com.
@@ -239,17 +232,25 @@ def text(
             RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
             TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
         """
-        if LXML_AVAILABLE is False and backend != "api":
-            backend = "api"
-            warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)
 
-        if backend == "api":
-            results = self._text_api(keywords, region, safesearch, timelimit, max_results)
-        elif backend == "html":
-            results = self._text_html(keywords, region, timelimit, max_results)
-        elif backend == "lite":
-            results = self._text_lite(keywords, region, timelimit, max_results)
-        return results
+        backends = ["api", "html", "lite"] if backend == "auto" else [backend]
+        shuffle(backends)
+
+        results, err = [], None
+        for b in backends:
+            try:
+                if b == "api":
+                    results = self._text_api(keywords, region, safesearch, timelimit, max_results)
+                elif b == "html":
+                    results = self._text_html(keywords, region, timelimit, max_results)
+                elif b == "lite":
+                    results = self._text_lite(keywords, region, timelimit, max_results)
+                return results
+            except Exception as ex:
+                logger.info(f"Error to search using {b} backend: {ex}")
+                err = ex
+
+        raise DuckDuckGoSearchException(err)
 
     def _text_api(
         self,
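Since the loop above re-raises only after every backend has failed, wrapping the last error in `DuckDuckGoSearchException`, callers can treat that single exception as "all backends exhausted". A minimal sketch (the query is illustrative; the import paths match the package layout shown in this diff):

```python
from duckduckgo_search import DDGS
from duckduckgo_search.exceptions import DuckDuckGoSearchException

try:
    results = DDGS().text("rust async runtimes", max_results=5)
except DuckDuckGoSearchException as ex:
    # Reached only after api, html and lite (tried in random order) have
    # all failed; ex wraps the last underlying error.
    print(f"All backends failed: {ex}")
```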
@@ -298,7 +299,7 @@ def _text_api(
                         return results
                 else:
                     next_page_url = row.get("n")
-                    if not next_page_url:
+                    if not next_page_url or not max_results:
                         return results
                     payload["s"] = next_page_url.split("s=")[1].split("&")[0]
         return results
@@ -364,7 +365,7 @@ def _text_html(
                         return results
 
             npx = tree.xpath('.//div[@class="nav-link"]')
-            if not npx:
+            if not npx or not max_results:
                 return results
             next_page = npx[-1] if isinstance(npx, list) else None
             if isinstance(next_page, _Element):
@@ -446,7 +447,7 @@ def _text_lite(
                             return results
 
             next_page_s = tree.xpath("//form[./input[contains(@value, 'ext')]]/input[@name='s']/@value")
-            if not next_page_s:
+            if not next_page_s or not max_results:
                 return results
             elif isinstance(next_page_s, list):
                 payload["s"] = str(next_page_s[0])
@@ -516,13 +517,11 @@ def images(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _images_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(5):
             resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload)
             resp_json = json_loads(resp_content)
-
             page_data = resp_json.get("results", [])
-            page_results = []
+
             for row in page_data:
                 image_url = row.get("image")
                 if image_url and image_url not in cache:
@@ -536,20 +535,15 @@ def _images_page(s: int) -> list[dict[str, str]]:
"width": row["width"],
"source": row["source"],
}
page_results.append(result)
return page_results

slist = [0]
if max_results:
max_results = min(max_results, 500)
slist.extend(range(100, max_results, 100))
try:
for r in self._executor.map(_images_page, slist):
results.extend(r)
except Exception as e:
raise e
results.append(result)
if max_results and len(results) >= max_results:
return results
next = resp_json.get("next")
if next is None or not max_results:
return results
payload["s"] = next.split("s=")[-1].split("&")[0]

return list(islice(results, max_results))
return results

def videos(
self,
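The same rewrite, dropping the `ThreadPoolExecutor` page fan-out in favor of a sequential walk of the server-supplied `next` cursor, is applied to `videos()` and `news()` below. Stripped of the DuckDuckGo specifics, the loop shape is roughly the following sketch, where `fetch_page` is a hypothetical stand-in for `self._get_url` plus `json_loads`:

```python
from typing import Callable

def paginate(
    fetch_page: Callable[[dict], dict],
    payload: dict,
    max_pages: int = 5,
    max_results: int | None = None,
) -> list[dict]:
    """Follow a server-side 'next' cursor instead of precomputing page offsets."""
    results: list[dict] = []
    for _ in range(max_pages):
        page = fetch_page(payload)  # one sequential HTTP round-trip per page
        results.extend(page.get("results", []))
        if max_results and len(results) >= max_results:
            return results[:max_results]
        next_cursor = page.get("next")
        # Stop when the server reports no further pages, or when no
        # max_results was requested (one page suffices), as in the PR.
        if next_cursor is None or not max_results:
            return results
        payload["s"] = next_cursor.split("s=")[-1].split("&")[0]
    return results
```

This trades the old threaded guesswork (precomputed offsets like 0, 100, 200, capped at a hard maximum) for letting the server signal when results are exhausted.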
@@ -603,30 +597,23 @@ def videos(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _videos_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(8):
             resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload)
             resp_json = json_loads(resp_content)
-
             page_data = resp_json.get("results", [])
-            page_results = []
+
             for row in page_data:
                 if row["content"] not in cache:
                     cache.add(row["content"])
-                    page_results.append(row)
-            return page_results
-
-        slist = [0]
-        if max_results:
-            max_results = min(max_results, 200)
-            slist.extend(range(60, max_results, 60))
-        try:
-            for r in self._executor.map(_videos_page, slist):
-                results.extend(r)
-        except Exception as e:
-            raise e
+                    results.append(row)
+                    if max_results and len(results) >= max_results:
+                        return results
+            next = resp_json.get("next")
+            if next is None or not max_results:
+                return results
+            payload["s"] = next.split("s=")[-1].split("&")[0]
 
-        return list(islice(results, max_results))
+        return results
 
     def news(
         self,
@@ -672,12 +659,11 @@ def news(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _news_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(5):
             resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload)
             resp_json = json_loads(resp_content)
             page_data = resp_json.get("results", [])
-            page_results = []
 
             for row in page_data:
                 if row["url"] not in cache:
                     cache.add(row["url"])
@@ -690,17 +676,13 @@ def _news_page(s: int) -> list[dict[str, str]]:
"image": _normalize_url(image_url),
"source": row["source"],
}
page_results.append(result)
return page_results
results.append(result)
if max_results and len(results) >= max_results:
return results

slist = [0]
if max_results:
max_results = min(max_results, 120)
slist.extend(range(30, max_results, 30))
try:
for r in self._executor.map(_news_page, slist):
results.extend(r)
except Exception as e:
raise e
next = resp_json.get("next")
if next is None or not max_results:
return results
payload["s"] = next.split("s=")[-1].split("&")[0]

return list(islice(results, max_results))
return results
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -31,6 +31,7 @@ classifiers = [
 dependencies = [
     "click>=8.1.7",
     "primp>=0.9.1",
+    "lxml>=5.3.0",
 ]
 dynamic = ["version"]
 
@@ -44,12 +45,10 @@ ddgs = "duckduckgo_search.cli:safe_entry_point"
 version = {attr = "duckduckgo_search.version.__version__"}
 
 [project.optional-dependencies]
-lxml = [
-    "lxml>=5.3.0",
-]
 dev = [
     "mypy>=1.13.0",
     "pytest>=8.3.4",
     "pytest-dependency>=0.6.0",
     "ruff>=0.8.3",
 ]
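With lxml promoted to a hard dependency here, a plain `pip install -U duckduckgo_search` now installs it automatically, which is why the `[lxml]` extra above and the corresponding README note could be dropped.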
