Skip to content

Commit

Permalink
fix: [indexing] Avoid returning the complete dataset if no limit is given
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafiot committed Dec 12, 2024
1 parent 7df7ef7 commit b3189fb
Showing 1 changed file with 20 additions and 10 deletions.
30 changes: 20 additions & 10 deletions lookyloo/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import logging
import re

from datetime import datetime
from datetime import datetime, timedelta

from pathlib import Path

Expand Down Expand Up @@ -169,6 +169,16 @@ def index_capture(self, uuid_to_index: str, directory: Path) -> None:
finally:
self.indexing_done(uuid_to_index)

def __limit_failsafe(self, oldest_capture: datetime | None=None, limit: int | None=None) -> float | str:
    """Pick the lower score bound (``min_score``) for a capture lookup.

    The result is meant to be passed as the minimum score of a sorted-set
    range query, so it is either a POSIX timestamp or the string ``'-Inf'``.

    :param oldest_capture: The capture time of the oldest capture to consider.
    :param limit: Maximum number of entries the caller will request, if any.
    :return: ``oldest_capture`` as a timestamp when it is given; otherwise
             ``'-Inf'`` when a limit bounds the result size, or the timestamp
             of two days ago when nothing else bounds the query.
    """
    if oldest_capture:
        # An explicit lower bound is honoured whether or not a limit is set.
        return oldest_capture.timestamp()
    if limit:
        # The limit already caps the number of results: no time bound needed.
        return '-Inf'
    # Neither a limit nor an oldest capture: we *must* restrict the time
    # window ourselves to avoid returning the complete dataset.
    two_days_ago = datetime.now() - timedelta(days=2)
    return two_days_ago.timestamp()

# ###### Cookies ######

def _reindex_cookies(self, cookie_name: str) -> None:
Expand Down Expand Up @@ -232,7 +242,7 @@ def get_captures_cookies_name(self, cookie_name: str, most_recent_capture: datet
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
if self.redis.type(f'cookies_names|{cookie_name}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_cookies', *[entry.split('|')[0] for entry in self.redis.smembers(f'cn|{cookie_name}|captures')])
Expand Down Expand Up @@ -331,7 +341,7 @@ def get_captures_body_hash(self, body_hash: str, most_recent_capture: datetime |
:param filter_capture_uuid: UUID of the capture the hash was found in
'''
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)

if self.redis.type(f'bh|{body_hash}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
Expand Down Expand Up @@ -409,7 +419,7 @@ def get_captures_hhhash(self, hhh: str, most_recent_capture: datetime | None = N
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
if self.redis.type(f'hhhashes|{hhh}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_hhhashes', *self.redis.smembers(f'hhhashes|{hhh}|captures'))
Expand Down Expand Up @@ -513,7 +523,7 @@ def get_captures_url(self, url: str, most_recent_capture: datetime | None = None
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
md5 = hashlib.md5(url.encode()).hexdigest()
if self.redis.type(f'urls|{md5}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
Expand Down Expand Up @@ -542,7 +552,7 @@ def get_captures_hostname(self, hostname: str, most_recent_capture: datetime | N
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
if self.redis.type(f'hostnames|{hostname}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_urls', *self.redis.smembers(f'hostnames|{hostname}|captures'))
Expand Down Expand Up @@ -628,7 +638,7 @@ def get_captures_tld(self, tld: str, most_recent_capture: datetime | None = None
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
total = self.redis.zcard(f'tlds|{tld}|captures')
return total, self.redis.zrevrangebyscore(f'tlds|{tld}|captures', max_score, min_score, withscores=True, start=offset, num=limit)

Expand Down Expand Up @@ -693,7 +703,7 @@ def get_captures_favicon(self, favicon_sha512: str, most_recent_capture: datetim
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
total = self.redis.zcard(f'favicons|{favicon_sha512}|captures')
return total, self.redis.zrevrangebyscore(f'favicons|{favicon_sha512}|captures', max_score, min_score, withscores=True, start=offset, num=limit)

Expand Down Expand Up @@ -801,7 +811,7 @@ def get_captures_hash_type(self, hash_type: str, h: str, most_recent_capture: da
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
total = self.redis.zcard(f'capture_hash_types|{hash_type}|{h}|captures')
return total, self.redis.zrevrangebyscore(f'capture_hash_types|{hash_type}|{h}|captures', max_score, min_score, withscores=True, start=offset, num=limit)

Expand Down Expand Up @@ -884,7 +894,7 @@ def get_captures_identifier(self, identifier_type: str, identifier: str,
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
min_score: str | float = self.__limit_failsafe(oldest_capture, limit)
if self.redis.type(f'identifiers|{identifier_type}|{identifier}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_identifiers', *self.redis.smembers(f'identifiers|{identifier_type}|{identifier}|captures'))
Expand Down

0 comments on commit b3189fb

Please sign in to comment.