From 6f9101a5dc95686748b4290c2632f33be5cdab41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Mon, 20 Jan 2025 00:53:05 +0100
Subject: [PATCH] chg: scan when searching, get all entries when in tables

---
 .pre-commit-config.yaml |  2 +-
 lookyloo/indexing.py    |  7 +++++++
 website/web/__init__.py | 52 +++++++++++++++++++-----------------------------
 3 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 48f0eb8f..3ce0f868 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,7 +10,7 @@ repos:
   - id: check-yaml
   - id: check-added-large-files
 - repo: https://github.com/asottile/pyupgrade
-  rev: v3.19.0
+  rev: v3.19.1
   hooks:
   - id: pyupgrade
     args: [--py39-plus]
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index bd240716..6d90b5f1 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -5,6 +5,7 @@
 import hashlib
 import logging
 import re
 
+from collections.abc import Iterator
 from datetime import datetime, timedelta
 
@@ -351,6 +352,9 @@ def get_captures_body_hash(self, body_hash: str, most_recent_capture: datetime |
         total = self.redis.zcard(f'body_hashes|{body_hash}|captures')
         return total, self.redis.zrevrangebyscore(f'body_hashes|{body_hash}|captures', max_score, min_score, withscores=True, start=offset, num=limit)
 
+    def scan_captures_body_hash(self, body_hash: str) -> Iterator[tuple[str, float]]:
+        yield from self.redis.zscan_iter(f'body_hashes|{body_hash}|captures')
+
     def get_capture_body_hash_nodes(self, capture_uuid: str, body_hash: str) -> set[str]:
         if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|body_hashes|{body_hash}'):
             return set(url_nodes)
@@ -428,6 +432,9 @@ def get_captures_hhhash(self, hhh: str, most_recent_capture: datetime | None = N
         total = self.redis.zcard(f'hhhashes|{hhh}|captures')
         return total, self.redis.zrevrangebyscore(f'hhhashes|{hhh}|captures', max_score, min_score, withscores=True, start=offset, num=limit)
 
+    def scan_captures_hhhash(self, hhh: str) -> Iterator[tuple[str, float]]:
+        yield from self.redis.zscan_iter(f'hhhashes|{hhh}|captures')
+
     def get_captures_hhhash_count(self, hhh: str) -> int:
         return self.redis.zcard(f'hhhashes|{hhh}|captures')
 
diff --git a/website/web/__init__.py b/website/web/__init__.py
index fd5bf85a..79081b3e 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -359,21 +359,18 @@ def handle_pydandic_validation_exception(error: CaptureSettingsError) -> Respons
 
 # ##### Methods querying the indexes #####
 
-def get_body_hash_investigator_search(body_hash: str, offset: int, limit: int, search: str) -> tuple[int, list[CaptureCache]]:
+def get_body_hash_investigator_search(body_hash: str, search: str) -> tuple[int, list[CaptureCache]]:
     cached_captures: list[CaptureCache] = []
-    while True:
-        total, entries = get_indexing(flask_login.current_user).get_captures_body_hash(body_hash, offset=offset, limit=limit)
-        cached_captures += [capture for capture in lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False) if capture.search(search)]
-        offset += limit
-        if total < offset:
-            break
+    total = get_indexing(flask_login.current_user).get_captures_body_hash_count(body_hash)
+    entries = [uuid for uuid, _ in get_indexing(flask_login.current_user).scan_captures_body_hash(body_hash)]
+    cached_captures += [capture for capture in lookyloo.sorted_capture_cache(entries, cached_captures_only=False) if capture.search(search)]
     return total, cached_captures
 
 
-def _get_body_hash_investigator(body_hash: str, offset: int | None=None, limit: int | None=None, search: str | None=None) -> tuple[int, list[tuple[str, str, datetime, str, str]]]:
+def _get_body_hash_investigator(body_hash: str, offset: int | None=None, limit: int | None=None, search: str | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
     '''Returns all the captures related to a hash (sha512), used in the web interface.'''
-    if offset is not None and limit and search:
-        total, cached_captures = get_body_hash_investigator_search(body_hash=body_hash, offset=offset, limit=limit, search=search)
+    if search:
+        total, cached_captures = get_body_hash_investigator_search(body_hash=body_hash, search=search)
     else:
         total, entries = get_indexing(flask_login.current_user).get_captures_body_hash(body_hash=body_hash, offset=offset, limit=limit)
         cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
@@ -386,7 +383,7 @@ def _get_body_hash_investigator(body_hash: str, offset: int | None=None, limit:
                 nodes_info.append((urlnode.name, urlnode_uuid))
             except IndexError:
                 continue
-        captures.append((cache.uuid, cache.title, cache.timestamp, cache.redirects[-1], nodes_info))
+        captures.append((cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, nodes_info))
     return total, captures
 
 
@@ -443,7 +440,7 @@ def get_all_urls(capture_uuid: str, /) -> dict[str, dict[str, int | list[URLNode
 def get_hostname_investigator(hostname: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
     '''Returns all the captures loading content from that hostname, used in the web interface.'''
     total, entries = get_indexing(flask_login.current_user).get_captures_hostname(hostname=hostname, offset=offset, limit=limit)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     _captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, get_indexing(flask_login.current_user).get_capture_hostname_nodes(cache.uuid, hostname)) for cache in cached_captures]
     captures = []
     for capture_uuid, capture_title, landing_page, capture_ts, nodes in _captures:
@@ -479,7 +476,7 @@ def get_url_investigator(url: str, offset: int | None=None, limit: int | None=No
 def get_cookie_name_investigator(cookie_name: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
     '''Returns all the captures related to a cookie name entry, used in the web interface.'''
     total, entries = get_indexing(flask_login.current_user).get_captures_cookies_name(cookie_name=cookie_name)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     _captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, get_indexing(flask_login.current_user).get_capture_cookie_name_nodes(cache.uuid, cookie_name)) for cache in cached_captures]
     captures = []
     for capture_uuid, capture_title, landing_page, capture_ts, nodes in _captures:
@@ -496,14 +493,14 @@ def get_cookie_name_investigator(cookie_name: str, offset: int | None=None, limi
 
 def get_identifier_investigator(identifier_type: str, identifier: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime]]]:
     total, entries = get_indexing(flask_login.current_user).get_captures_identifier(identifier_type=identifier_type, identifier=identifier, offset=offset, limit=limit)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
     return total, captures
 
 
 def get_capture_hash_investigator(hash_type: str, h: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime]]]:
     total, entries = get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h, offset=offset, limit=limit)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
     return total, captures
 
@@ -511,29 +508,26 @@ def get_capture_hash_investigator(hash_type: str, h: str, offset: int | None=Non
 def get_favicon_investigator(favicon_sha512: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime]]]:
     '''Returns all the captures related to a cookie name entry, used in the web interface.'''
     total, entries = get_indexing(flask_login.current_user).get_captures_favicon(favicon_sha512=favicon_sha512, offset=offset, limit=limit)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
     return total, captures
 
 
-def get_hhh_investigator_search(hhh: str, offset: int, limit: int, search: str) -> tuple[int, list[CaptureCache]]:
+def get_hhh_investigator_search(hhh: str, search: str) -> tuple[int, list[CaptureCache]]:
     cached_captures: list[CaptureCache] = []
-    while True:
-        total, entries = get_indexing(flask_login.current_user).get_captures_hhhash(hhh, offset=offset, limit=limit)
-        cached_captures += [capture for capture in lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False) if capture.search(search)]
-        offset += limit
-        if total < offset:
-            break
+    total = get_indexing(flask_login.current_user).get_captures_hhhash_count(hhh)
+    entries = [uuid for uuid, _ in get_indexing(flask_login.current_user).scan_captures_hhhash(hhh)]
+    cached_captures += [capture for capture in lookyloo.sorted_capture_cache(entries, cached_captures_only=False) if capture.search(search)]
     return total, cached_captures
 
 
 def get_hhh_investigator(hhh: str, offset: int | None=None, limit: int | None=None, search: str | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
     '''Returns all the captures related to a cookie name entry, used in the web interface.'''
-    if offset is not None and limit and search:
-        total, cached_captures = get_hhh_investigator_search(hhh, offset=offset, limit=limit, search=search)
+    if search:
+        total, cached_captures = get_hhh_investigator_search(hhh, search=search)
     else:
         total, entries = get_indexing(flask_login.current_user).get_captures_hhhash(hhh, offset=offset, limit=limit)
-        cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+        cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     _captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, get_indexing(flask_login.current_user).get_capture_hhhash_nodes(cache.uuid, hhh)) for cache in cached_captures]
     captures = []
     for capture_uuid, capture_title, landing_page, capture_ts, nodes in _captures:
@@ -2084,7 +2078,7 @@ def post_table(table_name: str, value: str) -> Response:
     if table_name == 'HHHDetailsTable':
         hhh = value.strip()
        total, captures = get_hhh_investigator(hhh, offset=start, limit=length, search=search)
-        if search:
+        if search and start is not None and length is not None:
            total_filtered = len(captures)
            captures = captures[start:start + length]
        prepared_captures = []
@@ -2124,11 +2118,11 @@ def post_table(table_name: str, value: str) -> Response:
     if table_name == 'bodyHashDetailsTable':
        body_hash = value.strip()
        total, captures = _get_body_hash_investigator(body_hash, offset=start, limit=length, search=search)
-        if search:
+        if search and start is not None and length is not None:
            total_filtered = len(captures)
            captures = captures[start:start + length]
        prepared_captures = []
-        for capture_uuid, title, capture_time, landing_page, nodes in captures:
+        for capture_uuid, title, landing_page, capture_time, nodes in captures:
            _nodes = __prepare_node_view(capture_uuid, nodes, from_popup)
            to_append = {
                'capture_time': capture_time.isoformat(),
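Note (reviewer sketch, not part of the patch): the new scan_captures_* helpers
replace the old offset/limit polling loop with redis-py's zscan_iter, which
wraps the ZSCAN cursor protocol and yields (member, score) tuples until the
whole sorted set has been walked. The post_table endpoints then filter the
complete result set first and only slice out the requested page when both
start and length are set. A minimal, self-contained sketch of both ideas,
assuming a local Redis instance; the key and the row data are made up for
illustration:

    from __future__ import annotations

    import redis

    r = redis.Redis(decode_responses=True)

    # ZSCAN walks a sorted set incrementally with a server-side cursor, so no
    # single command has to materialise the whole set; zscan_iter hides the
    # cursor loop and yields (member, score) tuples, which is exactly what
    # scan_captures_body_hash() and scan_captures_hhhash() re-yield.
    def scan_capture_uuids(key: str) -> list[str]:
        return [member for member, _score in r.zscan_iter(key)]

    # DataTables-style server-side processing, as in post_table(): when a
    # search term is present, filter the full result set, then apply the
    # start/length window; both must be non-None before slicing.
    def filter_then_paginate(rows: list[str], needle: str,
                             start: int | None, length: int | None) -> list[str]:
        filtered = [row for row in rows if needle in row]
        if start is not None and length is not None:
            return filtered[start:start + length]
        return filtered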