From 6f9101a5dc95686748b4290c2632f33be5cdab41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Mon, 20 Jan 2025 00:53:05 +0100
Subject: [PATCH] chg: scan when searching, get all entries when in tables

---
 .pre-commit-config.yaml |  2 +-
 lookyloo/indexing.py    |  7 +++++++
 website/web/__init__.py | 52 +++++++++++++++++++-----------------------------
 3 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 48f0eb8f..3ce0f868 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,7 +10,7 @@ repos:
   - id: check-yaml
   - id: check-added-large-files
 - repo: https://github.com/asottile/pyupgrade
-  rev: v3.19.0
+  rev: v3.19.1
   hooks:
   - id: pyupgrade
     args: [--py39-plus]
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index bd240716..6d90b5f1 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -5,6 +5,7 @@
 import hashlib
 import logging
 import re
 
+from collections.abc import Iterator
 from datetime import datetime, timedelta
 
@@ -351,6 +352,9 @@ def get_captures_body_hash(self, body_hash: str, most_recent_capture: datetime |
         total = self.redis.zcard(f'body_hashes|{body_hash}|captures')
         return total, self.redis.zrevrangebyscore(f'body_hashes|{body_hash}|captures', max_score, min_score, withscores=True, start=offset, num=limit)
 
+    def scan_captures_body_hash(self, body_hash: str) -> Iterator[tuple[str, float]]:
+        yield from self.redis.zscan_iter(f'body_hashes|{body_hash}|captures')
+
     def get_capture_body_hash_nodes(self, capture_uuid: str, body_hash: str) -> set[str]:
         if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|body_hashes|{body_hash}'):
             return set(url_nodes)
@@ -428,6 +432,9 @@ def get_captures_hhhash(self, hhh: str, most_recent_capture: datetime | None = N
         total = self.redis.zcard(f'hhhashes|{hhh}|captures')
         return total, self.redis.zrevrangebyscore(f'hhhashes|{hhh}|captures', max_score, min_score, withscores=True, start=offset, num=limit)
 
+    def scan_captures_hhhash(self, hhh: str) -> Iterator[tuple[str, float]]:
+        yield from self.redis.zscan_iter(f'hhhashes|{hhh}|captures')
+
     def get_captures_hhhash_count(self, hhh: str) -> int:
         return self.redis.zcard(f'hhhashes|{hhh}|captures')
 
diff --git a/website/web/__init__.py b/website/web/__init__.py
index fd5bf85a..79081b3e 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -359,21 +359,18 @@ def handle_pydandic_validation_exception(error: CaptureSettingsError) -> Respons
 
 # ##### Methods querying the indexes #####
 
-def get_body_hash_investigator_search(body_hash: str, offset: int, limit: int, search: str) -> tuple[int, list[CaptureCache]]:
+def get_body_hash_investigator_search(body_hash: str, search: str) -> tuple[int, list[CaptureCache]]:
     cached_captures: list[CaptureCache] = []
-    while True:
-        total, entries = get_indexing(flask_login.current_user).get_captures_body_hash(body_hash, offset=offset, limit=limit)
-        cached_captures += [capture for capture in lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False) if capture.search(search)]
-        offset += limit
-        if total < offset:
-            break
+    total = get_indexing(flask_login.current_user).get_captures_body_hash_count(body_hash)
+    entries = [uuid for uuid, _ in get_indexing(flask_login.current_user).scan_captures_body_hash(body_hash)]
+    cached_captures += [capture for capture in lookyloo.sorted_capture_cache(entries, cached_captures_only=False) if capture.search(search)]
     return total, cached_captures
 
 
-def _get_body_hash_investigator(body_hash: str, offset: int | None=None, limit: int | None=None, search: str | None=None) -> tuple[int, list[tuple[str, str, datetime, str, str]]]:
+def _get_body_hash_investigator(body_hash: str, offset: int | None=None, limit: int | None=None, search: str | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
     '''Returns all the captures related to a hash (sha512), used in the web interface.'''
-    if offset is not None and limit and search:
-        total, cached_captures = get_body_hash_investigator_search(body_hash=body_hash, offset=offset, limit=limit, search=search)
+    if search:
+        total, cached_captures = get_body_hash_investigator_search(body_hash=body_hash, search=search)
     else:
         total, entries = get_indexing(flask_login.current_user).get_captures_body_hash(body_hash=body_hash, offset=offset, limit=limit)
         cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
@@ -386,7 +383,7 @@ def _get_body_hash_investigator(body_hash: str, offset: int | None=None, limit:
                 nodes_info.append((urlnode.name, urlnode_uuid))
             except IndexError:
                 continue
-        captures.append((cache.uuid, cache.title, cache.timestamp, cache.redirects[-1], nodes_info))
+        captures.append((cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, nodes_info))
     return total, captures
 
 
@@ -443,7 +440,7 @@ def get_all_urls(capture_uuid: str, /) -> dict[str, dict[str, int | list[URLNode
 def get_hostname_investigator(hostname: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
     '''Returns all the captures loading content from that hostname, used in the web interface.'''
     total, entries = get_indexing(flask_login.current_user).get_captures_hostname(hostname=hostname, offset=offset, limit=limit)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     _captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, get_indexing(flask_login.current_user).get_capture_hostname_nodes(cache.uuid, hostname)) for cache in cached_captures]
     captures = []
     for capture_uuid, capture_title, landing_page, capture_ts, nodes in _captures:
@@ -479,7 +476,7 @@ def get_url_investigator(url: str, offset: int | None=None, limit: int | None=No
 def get_cookie_name_investigator(cookie_name: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
     '''Returns all the captures related to a cookie name entry, used in the web interface.'''
     total, entries = get_indexing(flask_login.current_user).get_captures_cookies_name(cookie_name=cookie_name)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     _captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, get_indexing(flask_login.current_user).get_capture_cookie_name_nodes(cache.uuid, cookie_name)) for cache in cached_captures]
     captures = []
     for capture_uuid, capture_title, landing_page, capture_ts, nodes in _captures:
@@ -496,14 +493,14 @@ def get_cookie_name_investigator(cookie_name: str, offset: int | None=None, limi
 
 def get_identifier_investigator(identifier_type: str, identifier: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime]]]:
     total, entries = get_indexing(flask_login.current_user).get_captures_identifier(identifier_type=identifier_type, identifier=identifier, offset=offset, limit=limit)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
     return total, captures
 
 
 def get_capture_hash_investigator(hash_type: str, h: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime]]]:
     total, entries = get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h, offset=offset, limit=limit)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
     return total, captures
 
@@ -511,29 +508,26 @@ def get_capture_hash_investigator(hash_type: str, h: str, offset: int | None=Non
 def get_favicon_investigator(favicon_sha512: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime]]]:
     '''Returns all the captures related to a cookie name entry, used in the web interface.'''
     total, entries = get_indexing(flask_login.current_user).get_captures_favicon(favicon_sha512=favicon_sha512, offset=offset, limit=limit)
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
     return total, captures
 
 
-def get_hhh_investigator_search(hhh: str, offset: int, limit: int, search: str) -> tuple[int, list[CaptureCache]]:
+def get_hhh_investigator_search(hhh: str, search: str) -> tuple[int, list[CaptureCache]]:
     cached_captures: list[CaptureCache] = []
-    while True:
-        total, entries = get_indexing(flask_login.current_user).get_captures_hhhash(hhh, offset=offset, limit=limit)
-        cached_captures += [capture for capture in lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False) if capture.search(search)]
-        offset += limit
-        if total < offset:
-            break
+    total = get_indexing(flask_login.current_user).get_captures_hhhash_count(hhh)
+    entries = [uuid for uuid, _ in get_indexing(flask_login.current_user).scan_captures_hhhash(hhh)]
+    cached_captures += [capture for capture in lookyloo.sorted_capture_cache(entries, cached_captures_only=False) if capture.search(search)]
     return total, cached_captures
 
 
 def get_hhh_investigator(hhh: str, offset: int | None=None, limit: int | None=None, search: str | None=None) -> tuple[int, list[tuple[str, str, str, datetime, list[tuple[str, str]]]]]:
     '''Returns all the captures related to a cookie name entry, used in the web interface.'''
-    if offset is not None and limit and search:
-        total, cached_captures = get_hhh_investigator_search(hhh, offset=offset, limit=limit, search=search)
+    if search:
+        total, cached_captures = get_hhh_investigator_search(hhh, search=search)
     else:
         total, entries = get_indexing(flask_login.current_user).get_captures_hhhash(hhh, offset=offset, limit=limit)
-        cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries])
+        cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in entries], cached_captures_only=False)
     _captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp, get_indexing(flask_login.current_user).get_capture_hhhash_nodes(cache.uuid, hhh)) for cache in cached_captures]
     captures = []
     for capture_uuid, capture_title, landing_page, capture_ts, nodes in _captures:
@@ -2084,7 +2078,7 @@ def post_table(table_name: str, value: str) -> Response:
     if table_name == 'HHHDetailsTable':
         hhh = value.strip()
        total, captures = get_hhh_investigator(hhh, offset=start, limit=length, search=search)
-        if search:
+        if search and start is not None and length is not None:
            total_filtered = len(captures)
            captures = captures[start:start + length]
        prepared_captures = []
@@ -2124,11 +2118,11 @@ def post_table(table_name: str, value: str) -> Response:
     if table_name == 'bodyHashDetailsTable':
        body_hash = value.strip()
        total, captures = _get_body_hash_investigator(body_hash, offset=start, limit=length, search=search)
-        if search:
+        if search and start is not None and length is not None:
            total_filtered = len(captures)
            captures = captures[start:start + length]
        prepared_captures = []
-        for capture_uuid, title, capture_time, landing_page, nodes in captures:
+        for capture_uuid, title, landing_page, capture_time, nodes in captures:
            _nodes = __prepare_node_view(capture_uuid, nodes, from_popup)
            to_append = {
                'capture_time': capture_time.isoformat(),
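Note (reviewer sketch, not part of the patch): the new scan_captures_* helpers
replace the old offset/limit polling loop with redis-py's zscan_iter, which
wraps the ZSCAN cursor protocol and yields (member, score) tuples until the
whole sorted set has been walked. The post_table endpoints then filter the
complete result set first and only slice out the requested page when both
start and length are set. A minimal, self-contained sketch of both ideas,
assuming a local Redis instance; the key and the row data are made up for
illustration:

    from __future__ import annotations

    import redis

    r = redis.Redis(decode_responses=True)

    # ZSCAN walks a sorted set incrementally with a server-side cursor, so no
    # single command has to materialise the whole set; zscan_iter hides the
    # cursor loop and yields (member, score) tuples, which is exactly what
    # scan_captures_body_hash() and scan_captures_hhhash() re-yield.
    def scan_capture_uuids(key: str) -> list[str]:
        return [member for member, _score in r.zscan_iter(key)]

    # DataTables-style server-side processing, as in post_table(): when a
    # search term is present, filter the full result set, then apply the
    # start/length window; both must be non-None before slicing.
    def filter_then_paginate(rows: list[str], needle: str,
                             start: int | None, length: int | None) -> list[str]:
        filtered = [row for row in rows if needle in row]
        if start is not None and length is not None:
            return filtered[start:start + length]
        return filtered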