diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..91abb11 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..749dd1a --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,43 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: + push: + branches: [ '*' ] + pull_request: + branches: [ '*' ] + +jobs: + build: + + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.13"] + os: [ubuntu-latest, macos-latest, windows-latest] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install -U pip + python -m pip install .[dev] + - name: Ruff + run: | + ruff check . + ruff format . --check + - name: Mypy + run: | + python -m pip install lxml-stubs + mypy . 
+ - name: Pytest + run: | + pytest diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..82f8dbd --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,70 @@ +# This workflow will upload a Python Package to PyPI when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. + python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + needs: + - release-build + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + + # Dedicated environments with protections for publishing are strongly recommended. 
+ # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules + environment: + name: pypi + # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: + # url: https://pypi.org/p/YOURPROJECT + # + # ALTERNATIVE: if your GitHub Release name is the PyPI project version string + # ALTERNATIVE: exactly, uncomment the following line instead: + # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f8bdff4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,166 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Visual Studio Code +.vscode/ + +# ruff Python linter +.ruff_cache/ \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..16e5469 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 deedy5 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..c705060 --- /dev/null +++ b/README.md @@ -0,0 +1,96 @@ +![Python >= 3.8](https://img.shields.io/badge/python->=3.8-red.svg) [![](https://badgen.net/github/release/deedy5/searxng_search)](https://github.com/deedy5/searxng_search/releases) [![](https://badge.fury.io/py/searxng-search.svg)](https://pypi.org/project/searxng-search) [![Downloads](https://static.pepy.tech/badge/searxng-search)](https://pepy.tech/project/searxng-search) [![Downloads](https://static.pepy.tech/badge/searxng-search/week)](https://pepy.tech/project/searxng-search) +# searxng_search + +Web search using the searxng instances. + +## Table of Contents +* [Install](#install) +* [SearxngSearch class](#searxngsearch-class) + * [Proxy](#proxy) + * [Exceptions](#exceptions) + * [search()](#search) + +___ +## Install +```python +pip install -U searxng_search +``` +___ +## SearxngSearch class +```python3 +"""Searxng search. Query params: https://docs.searxng.org/dev/search_api.html. + +Args: + q: search query. + language: code of the language. Defaults to "auto". + pageno: search page number. Defaults to 1. + time_range: "day", "week", "month", "year". Defaults to "". + safesearch: 0, 1, 2. Defaults to 1. +""" +``` + +Here is an example of initializing the SearxngSearch class. +```python3 +from searxng_search import SearxngSearch + +results = SearxngSearch().search("python") +print(results) +``` +___ +### Proxy + +Package supports http/https/socks proxies. Example: `http://user:pass@example.com:3128`. +Use a rotating proxy. Otherwise, use a new proxy with each SearxngSearch class initialization. + +*1. The easiest way. 
Launch the Tor Browser* +```python3 +from searxng_search import SearxngSearch + +ss = SearxngSearch(proxy="socks5://127.0.0.1:9150", timeout=20) +results = ss.search("python") +``` +*2. Use any proxy server* (*example with [iproyal rotating residential proxies](https://iproyal.com?r=residential_proxies)*) +```python3 +from searxng_search import SearxngSearch + +ss = SearxngSearch(proxy="socks5h://user:password@geo.iproyal.com:32325", timeout=20) +results = ss.search("something you need") +``` +___ +### Exceptions + +Exceptions: +- `SearxngSearchException`: Base exception for searxng_search errors. +- `RatelimitException`: Inherits from SearxngSearchException, raised for exceeding request rate limits. +- `TimeoutException`: Inherits from SearxngSearchException, raised for request timeouts. +___ +### search() + +```python +def search( + self, + q: str, + language: str = "auto", + pageno: str | int = 1, + time_range: str = "", + safesearch: str | int = 1, +) -> list[dict[str, str]]: + """Searxng search. Query params: https://docs.searxng.org/dev/search_api.html. + + Args: + q: search query. + language: code of the language. Defaults to "auto". + pageno: search page number. Defaults to 1. + time_range: "day", "week", "month", "year". Defaults to "". + safesearch: 0, 1, 2. Defaults to 1. + + Returns: + List of dictionaries with search results. 
+``` +***Example*** +```python +from searxng_search import SS # SS = SearxngSearch (alias) + +results = SS().search("python", language="fr", pageno=4, time_range="year", safesearch=0) +print(results) +``` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..654d9ad --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,66 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "searxng_search" +description = "Web search using the searxng instances" +readme = "README.md" +requires-python = ">=3.8" +license = {text = "MIT License"} +keywords = ["python", "searxng"] +authors = [ + {name = "deedy5"} +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ + "primp>=0.9.2", + "lxml>=5.3.0", +] +dynamic = ["version"] + +[project.urls] # Optional +"Homepage" = "https://github.com/deedy5/searxng_search" + +[tool.setuptools.dynamic] +version = {attr = "searxng_search.version.__version__"} + +[project.optional-dependencies] +dev = [ + "mypy>=1.13.0", + "pytest>=8.3.4", + "ruff>=0.8.3", +] + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "E", # pycodestyle + "F", # Pyflakes + "UP", # pyupgrade + "B", # flake8-bugbear + "SIM", # flake8-simplify + "I", # isort +] + +[tool.mypy] +python_version = "3.8" +strict = true +exclude = ["build/"] \ No 
class SearxngSearchException(Exception):
    """Root of the searxng_search exception hierarchy.

    Catching this type also catches every more specific error defined below.
    """


class RatelimitException(SearxngSearchException):
    """Signals that an API request failed because the rate limit was exceeded."""


class TimeoutException(SearxngSearchException):
    """Signals that an API request failed because it timed out."""
class SearxngSearch:
    """Run web searches through public searxng instances.

    On construction the object picks a random browser profile to impersonate,
    loads the list of candidate instances (cached for one hour in
    ``~/searxng_instances.txt``) and picks one of them at random. Every call
    to :meth:`search` is then sent to that instance.
    """

    # Read once, when the class body is executed. When set, it takes precedence
    # over the ``proxy`` argument of ``__init__``.
    _searxng_proxy: str | None = os.environ.get("SEARXNG_PROXY")
    _impersonates = (
        "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107",
        "chrome_108", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118",
        "chrome_119", "chrome_120", "chrome_123", "chrome_124", "chrome_126", "chrome_127",
        "chrome_128", "chrome_129", "chrome_130", "chrome_131",
        "safari_ios_16.5", "safari_ios_17.2", "safari_ios_17.4.1", "safari_ios_18.1.1",
        "safari_15.3", "safari_15.5", "safari_15.6.1", "safari_16", "safari_16.5",
        "safari_17.0", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
        "safari_18", "safari_18.2",
        "safari_ipad_18",
        "edge_101", "edge_122", "edge_127", "edge_131",
        "firefox_109", "firefox_133",
    )  # fmt: skip

    def __init__(
        self,
        headers: dict[str, str] | None = None,
        proxy: str | None = None,
        timeout: int | None = 15,
        verify: bool = True,
    ) -> None:
        """Initialize the SearxngSearch object.

        Args:
            headers (dict, optional): Dictionary of headers for the HTTP client. Defaults to None.
            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
                example: "http://user:pass@example.com:3128". Defaults to None.
                Ignored when the SEARXNG_PROXY environment variable is set.
            timeout (int, optional): Timeout value for the HTTP client. Defaults to 15.
            verify (bool): SSL verification when making the request. Defaults to True.

        Raises:
            SearxngSearchException: if no usable searxng instance is available.
        """
        self.proxy: str | None = self._searxng_proxy or proxy
        self.headers = headers or {}
        self.impersonate = choice(self._impersonates)
        self.client = primp.Client(
            headers=self.headers,
            proxy=self.proxy,
            timeout=timeout,
            impersonate=self.impersonate,
            follow_redirects=True,
            verify=verify,
        )
        self.searxng_instances = self._load_searxng_instances()
        if not self.searxng_instances:
            # choice([]) would raise a bare IndexError with no hint about the cause.
            raise SearxngSearchException("No usable searxng instances found")
        self.searxng_instance = choice(self.searxng_instances)

    def __enter__(self) -> SearxngSearch:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None = None,
        exc_val: BaseException | None = None,
        exc_tb: TracebackType | None = None,
    ) -> None:
        # Nothing to release; the context manager exists for API convenience only.
        pass

    @cached_property
    def parser(self) -> LHTMLParser:
        """Get HTML parser."""
        return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)

    def _get_url(
        self,
        method: str,
        url: str,
        params: dict[str, str] | None = None,
    ) -> bytes:
        """Send a request with the shared client and return the response body.

        Args:
            method: HTTP method name.
            url: target URL.
            params: query parameters. Defaults to None.

        Returns:
            Raw response content of a 200 response.

        Raises:
            TimeoutException: the client raised and the error text contains "time".
            RatelimitException: the response status is 429.
            SearxngSearchException: any other client error or non-200 status.
        """
        try:
            resp = self.client.request(method, url, params=params)
        except Exception as ex:
            # The client's exception types are not inspected; timeouts are detected from the message text.
            if "time" in str(ex).lower():
                raise TimeoutException(f"{url} {type(ex).__name__}: {ex}") from ex
            raise SearxngSearchException(f"{url} {type(ex).__name__}: {ex}") from ex
        if resp.status_code == 200:
            return cast(bytes, resp.content)
        elif resp.status_code == 429:
            raise RatelimitException(f"{resp.url} {resp.status_code} Ratelimit")
        raise SearxngSearchException(f"{resp.url} return None. {params=}")

    @staticmethod
    def _dig(data: object, *keys: str) -> object:
        """Follow keys through nested dicts; return None once a level is missing or is not a dict."""
        for key in keys:
            if not isinstance(data, dict):
                return None
            data = data.get(key)
        return data

    @classmethod
    def _is_usable_instance(cls, info: object) -> bool:
        """Tell whether one searx.space instance record passes the quality filter.

        A record passes when it is on the "normal" network, answers with HTTP
        200, has a truthy "bing" or "google" engine entry, reports a 100%
        success rate for the "initial", "search" and "search_go" timings, and
        has a median search time of at most 1. Records with missing or
        malformed fields are rejected instead of raising.
        """
        dig = cls._dig
        if dig(info, "network_type") != "normal" or dig(info, "http", "status_code") != 200:
            return False
        if not (dig(info, "engines", "bing") or dig(info, "engines", "google")):
            return False
        if any(dig(info, "timing", probe, "success_percentage") != 100 for probe in ("initial", "search", "search_go")):
            return False
        median = dig(info, "timing", "search", "all", "median")
        # A missing median used to blow up with "None <= 1"; treat it as "not good enough".
        return isinstance(median, (int, float)) and median <= 1

    @staticmethod
    def _client_token_url(search_url: str, href: str) -> str:
        """Resolve the "/client..." link found in a result page against the instance search URL.

        Standard URL resolution is used so that instances served from any
        sub-path work, e.g. ``("https://host/engine/search", "/engine/clientX.css")``
        gives ``"https://host/engine/clientX.css"``.
        """
        from urllib.parse import urljoin

        return urljoin(search_url, href)

    def _get_searxng_instances(self) -> list[str]:
        """Download the public instance list and return the search URLs that pass the filter.

        Returns:
            One "<instance key>search" URL per accepted record (keys are
            presumably base URLs ending with "/"); an empty list when the
            payload has no "instances" mapping.
        """
        url = "https://searx.space/data/instances.json"
        resp = primp.Client(
            impersonate=self.impersonate,
            proxy=self.proxy,
        ).get(url)
        instances = self._dig(resp.json(), "instances")
        if not isinstance(instances, dict):
            return []
        return [f"{base_url}search" for base_url, info in instances.items() if self._is_usable_instance(info)]

    def _load_searxng_instances(self) -> list[str]:
        """Return the instance list, using a one-hour cache file in the home directory.

        The cache is used only when it is younger than 3600 seconds and not
        empty; otherwise the list is downloaded again. Only non-empty lists
        are written back, so a failed lookup is retried on the next call.
        """
        file_path = Path.home() / "searxng_instances.txt"
        try:
            is_fresh = time() - file_path.stat().st_mtime < 3600
        except OSError:  # no cache file yet (or it is not accessible)
            is_fresh = False

        if is_fresh:
            # Read with the same encoding that is used for writing below.
            cached = file_path.read_text(encoding="utf-8").split()
            if cached:
                return cached

        data = self._get_searxng_instances()
        if data:
            file_path.write_text("\n".join(data), encoding="utf-8")
        return data

    def search(
        self,
        q: str,
        language: str = "auto",
        pageno: str | int = 1,
        time_range: str = "",
        safesearch: str | int = 1,
    ) -> list[dict[str, str]]:
        """Searxng search. Query params: https://docs.searxng.org/dev/search_api.html.

        Args:
            q: search query.
            language: code of the language. Defaults to "auto".
            pageno: search page number. Defaults to 1.
            time_range: "day", "week", "month", "year". Defaults to "".
            safesearch: 0, 1, 2. Defaults to 1.

        Returns:
            List of dictionaries with search results. Each one has the keys
            "title", "href" and "body".

        Raises:
            AssertionError: q is empty.
            SearxngSearchException: Base exception for searxng_search errors.
            RatelimitException: Inherits from SearxngSearchException, raised for exceeding API request rate limits.
            TimeoutException: Inherits from SearxngSearchException, raised for API request timeouts.
        """
        if not q:
            # Raised explicitly (same exception type as before) so the check survives "python -O".
            raise AssertionError("q is mandatory")

        payload = {
            "q": q,
            "category_general": "1",
            "pageno": f"{pageno}",
            "language": language,
            "time_range": time_range,
            "safesearch": f"{safesearch}",
            "theme": "simple",
        }

        results: list[dict[str, str]] = []

        resp_content = self._get_url("POST", self.searxng_instance, params=payload)
        if b"No results were found" in resp_content:
            return results

        tree = document_fromstring(resp_content, self.parser)

        # The result page may reference a "/client..." resource in <head>; request it like a browser would.
        token_hrefs = tree.xpath("//head//link[contains(@href, '/client')]/@href")
        if isinstance(token_hrefs, list) and token_hrefs:
            primp.Client(
                impersonate=self.impersonate,
                proxy=self.proxy,
            ).post(
                self._client_token_url(self.searxng_instance, str(token_hrefs[0])),
            )

        elements = tree.xpath("//div[@role='main']//article")
        if not isinstance(elements, list):
            return results

        for e in elements:
            if not isinstance(e, _Element):
                continue
            hrefxpath = e.xpath(".//h3/a/@href")
            href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
            if not href:
                continue  # an article without a link is not a usable result
            titlexpath = e.xpath(".//h3//text()")
            title = "".join(str(x) for x in titlexpath) if titlexpath and isinstance(titlexpath, list) else ""
            bodyxpath = e.xpath(".//p//text()")
            body = "".join(str(x) for x in bodyxpath) if bodyxpath and isinstance(bodyxpath, list) else ""
            results.append(
                {
                    "title": title,
                    "href": href,
                    "body": body.strip(),
                }
            )

        return results
from searxng_search import SS, SearxngSearch


def test_context_manager() -> None:
    """Search through the full class name, used as a context manager."""
    with SearxngSearch() as client:
        hits = client.search("cars", pageno=2)
        found = len(hits)
        assert found >= 5


def test_text_html() -> None:
    """Search through the SS alias with every optional parameter set."""
    options = {"safesearch": 0, "language": "br", "time_range": "year", "pageno": 2}
    hits = SS().search("eagle", **options)
    found = len(hits)
    assert found >= 5