Skip to content

Commit

Permalink
v3.9.1 (#120)
Browse files Browse the repository at this point in the history
1. bugfix in text(backend="lite") - page iteration didn't work,
2. recode text(backend="api") - the payload["s"] parameter is now taken from the page,
3. text() - delays between requests during iteration have been removed,
4. small changes in tests.

The text() function now returns up to 500 unique results.
  • Loading branch information
deedy5 authored Oct 1, 2023
1 parent 1ce76a6 commit be8b6b2
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 66 deletions.
11 changes: 4 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,7 @@ asyncio.run(get_results())
[Go To TOP](#TOP)

## 1. text() - text search by duckduckgo.com
`html` and `lite` backend differ from `api`:</br>
* don't do an extra request first to get vqd,</br>
* use POST requests,</br>
* pause 0.75 seconds between paginations.</br>

If you use `html` or `lite` backend, pause at least 0.75 seconds between text() calls.
Otherwise the site will return a 403 status code after a few requests and block your ip for a few seconds.
```python
def text(
keywords: str,
Expand Down Expand Up @@ -339,6 +333,7 @@ with DDGS() as ddgs:
type_image=None,
layout=None,
license_image=None,
max_results=100,
)
for r in ddgs_images_gen:
print(r)
Expand Down Expand Up @@ -401,6 +396,7 @@ with DDGS() as ddgs:
timelimit="w",
resolution="high",
duration="medium",
max_results=100,
)
for r in ddgs_videos_gen:
print(r)
Expand Down Expand Up @@ -456,6 +452,7 @@ with DDGS() as ddgs:
region="wt-wt",
safesearch="off",
timelimit="m",
max_results=20
)
for r in ddgs_news_gen:
print(r)
Expand Down Expand Up @@ -519,7 +516,7 @@ def maps(
from duckduckgo_search import DDGS

with DDGS() as ddgs:
for r in ddgs.maps("school", place="Uganda"):
for r in ddgs.maps("school", place="Uganda", max_results=50):
print(r)
```
***Async***
Expand Down
39 changes: 19 additions & 20 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from lxml import html

from .models import MapsResult
from .utils import USERAGENTS, _is_500_in_url, _normalize, _normalize_url
from .utils import USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -66,17 +66,7 @@ def _get_vqd(self, keywords: str) -> Optional[str]:
"""Get vqd value for a search query."""
resp = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
if resp:
for c1, c2 in (
(b'vqd="', b'"'),
(b"vqd=", b"&"),
(b"vqd='", b"'"),
):
try:
start = resp.content.index(c1) + len(c1)
end = resp.content.index(c2, start)
return resp.content[start:end].decode()
except ValueError:
logger.warning(f"_get_vqd() keywords={keywords} vqd not found")
return _extract_vqd(resp.content)

def text(
self,
Expand Down Expand Up @@ -158,8 +148,7 @@ def _text_api(
payload["p"] = "1"

cache = set()
for s in ("0", "20", "70", "120"):
payload["s"] = s
for _ in range(10):
resp = self._get_url(
"GET", "https://links.duckduckgo.com/d.js", params=payload
)
Expand Down Expand Up @@ -189,8 +178,11 @@ def _text_api(
"href": _normalize_url(href),
"body": body,
}
if result_exists is False:
else:
next_page_url = row.get("n", None)
if result_exists is False or next_page_url is None:
break
payload["s"] = next_page_url.split("s=")[1].split("&")[0]

def _text_html(
self,
Expand Down Expand Up @@ -259,7 +251,7 @@ def _text_html(
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
sleep(0.75)
# sleep(0.75)

def _text_lite(
self,
Expand All @@ -282,12 +274,14 @@ def _text_lite(

payload = {
"q": keywords,
"s": "0",
"o": "json",
"api": "d.js",
"kl": region,
"df": timelimit,
}
cache: Set[str] = set()
for s in ("0", "20", "70", "120"):
payload["s"] = s
for _ in range(10):
resp = self._get_url(
"POST", "https://lite.duckduckgo.com/lite/", data=payload
)
Expand Down Expand Up @@ -324,9 +318,14 @@ def _text_lite(
"href": _normalize_url(href),
"body": _normalize(body),
}
if result_exists is False:
next_page_s = tree.xpath(
"//form[./input[contains(@value, 'ext')]]/input[@name='s']/@value"
)
if result_exists is False or not next_page_s:
break
sleep(0.75)
payload["s"] = next_page_s[0]
payload["vqd"] = _extract_vqd(resp.content)
# sleep(0.75)

def images(
self,
Expand Down
39 changes: 19 additions & 20 deletions duckduckgo_search/duckduckgo_search_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from lxml import html

from .models import MapsResult
from .utils import USERAGENTS, _is_500_in_url, _normalize, _normalize_url
from .utils import USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -68,17 +68,7 @@ async def _get_vqd(self, keywords: str) -> Optional[str]:
"POST", "https://duckduckgo.com", data={"q": keywords}
)
if resp:
for c1, c2 in (
(b'vqd="', b'"'),
(b"vqd=", b"&"),
(b"vqd='", b"'"),
):
try:
start = resp.content.index(c1) + len(c1)
end = resp.content.index(c2, start)
return resp.content[start:end].decode()
except ValueError:
logger.warning(f"_get_vqd() keywords={keywords} vqd not found")
return _extract_vqd(resp.content)

async def text(
self,
Expand Down Expand Up @@ -162,8 +152,7 @@ async def _text_api(
payload["p"] = "1"

cache = set()
for s in ("0", "20", "70", "120"):
payload["s"] = s
for _ in range(10):
resp = await self._get_url(
"GET", "https://links.duckduckgo.com/d.js", params=payload
)
Expand Down Expand Up @@ -193,8 +182,11 @@ async def _text_api(
"href": _normalize_url(href),
"body": body,
}
if result_exists is False:
else:
next_page_url = row.get("n", None)
if result_exists is False or next_page_url is None:
break
payload["s"] = next_page_url.split("s=")[1].split("&")[0]

async def _text_html(
self,
Expand Down Expand Up @@ -263,7 +255,7 @@ async def _text_html(
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
await asyncio.sleep(0.75)
# await asyncio.sleep(0.75)

async def _text_lite(
self,
Expand All @@ -286,12 +278,14 @@ async def _text_lite(

payload = {
"q": keywords,
"s": "0",
"o": "json",
"api": "d.js",
"kl": region,
"df": timelimit,
}
cache: Set[str] = set()
for s in ("0", "20", "70", "120"):
payload["s"] = s
for _ in range(10):
resp = await self._get_url(
"POST", "https://lite.duckduckgo.com/lite/", data=payload
)
Expand Down Expand Up @@ -328,9 +322,14 @@ async def _text_lite(
"href": _normalize_url(href),
"body": _normalize(body),
}
if result_exists is False:
next_page_s = tree.xpath(
"//form[./input[contains(@value, 'ext')]]/input[@name='s']/@value"
)
if result_exists is False or not next_page_s:
break
await asyncio.sleep(0.75)
payload["s"] = next_page_s[0]
payload["vqd"] = _extract_vqd(resp.content)
# await asyncio.sleep(0.75)

async def images(
self,
Expand Down
15 changes: 15 additions & 0 deletions duckduckgo_search/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from html import unescape
from typing import Optional
from urllib.parse import unquote

REGEX_500_IN_URL = re.compile(r"(?:\d{3}-\d{2}\.js)")
Expand All @@ -13,6 +14,20 @@
]


def _extract_vqd(html_bytes: bytes) -> Optional[str]:
for c1, c2 in (
(b'vqd="', b'"'),
(b"vqd=", b"&"),
(b"vqd='", b"'"),
):
try:
start = html_bytes.index(c1) + len(c1)
end = html_bytes.index(c2, start)
return html_bytes[start:end].decode()
except ValueError:
pass


def _is_500_in_url(url: str) -> bool:
    """Report whether *url* contains a fragment such as '506-00.js'."""
    match = REGEX_500_IN_URL.search(url)
    return match is not None
Expand Down
2 changes: 1 addition & 1 deletion duckduckgo_search/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.9.0"
__version__ = "3.9.1"
16 changes: 8 additions & 8 deletions tests/test_duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,28 @@

def test_text():
with DDGS() as ddgs:
results = [x for x in ddgs.text("cat", max_results=25)]
assert len(results) >= 25
results = [x for x in ddgs.text("cat", max_results=30)]
assert len(results) >= 30


def test_text_params():
with DDGS() as ddgs:
results = [
x for x in ddgs.text("cat", safesearch="off", timelimit="m", max_results=25)
x for x in ddgs.text("cat", safesearch="off", timelimit="m", max_results=30)
]
assert len(results) >= 25
assert len(results) >= 30


def test_text_html():
with DDGS() as ddgs:
results = [x for x in ddgs.text("eagle", backend="html", max_results=25)]
assert len(results) >= 25
results = [x for x in ddgs.text("eagle", backend="html", max_results=30)]
assert len(results) >= 30


def test_text_lite():
with DDGS() as ddgs:
results = [x for x in ddgs.text("dog", backend="lite", max_results=23)]
assert len(results) >= 23
results = [x for x in ddgs.text("dog", backend="lite", max_results=30)]
assert len(results) >= 30


def test_images():
Expand Down
20 changes: 10 additions & 10 deletions tests/test_duckduckgo_search_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
@pytest.mark.asyncio
async def test_text():
async with AsyncDDGS() as ddgs:
results = [x async for x in ddgs.text("cat", max_results=25)]
assert len(results) >= 25
results = [x async for x in ddgs.text("cat", max_results=30)]
assert len(results) >= 30


@pytest.mark.asyncio
Expand All @@ -16,24 +16,24 @@ async def test_text_params():
results = [
x
async for x in ddgs.text(
"cat", safesearch="off", timelimit="m", max_results=25
"cat", safesearch="off", timelimit="m", max_results=30
)
]
assert len(results) >= 25
assert len(results) >= 30


@pytest.mark.asyncio
async def test_text_html():
async with AsyncDDGS() as ddgs:
results = [x async for x in ddgs.text("eagle", backend="html", max_results=25)]
assert len(results) >= 25
results = [x async for x in ddgs.text("eagle", backend="html", max_results=30)]
assert len(results) >= 30


@pytest.mark.asyncio
async def test_text_lite():
async with AsyncDDGS() as ddgs:
results = [x async for x in ddgs.text("dog", backend="lite", max_results=23)]
assert len(results) >= 23
results = [x async for x in ddgs.text("dog", backend="lite", max_results=30)]
assert len(results) >= 30


@pytest.mark.asyncio
Expand All @@ -60,8 +60,8 @@ async def test_news():
@pytest.mark.asyncio
async def test_maps():
async with AsyncDDGS() as ddgs:
results = [x async for x in ddgs.maps("school", place="London", max_results=40)]
assert len(results) >= 40
results = [x async for x in ddgs.maps("school", place="London", max_results=30)]
assert len(results) >= 30


@pytest.mark.asyncio
Expand Down

0 comments on commit be8b6b2

Please sign in to comment.