new: fast internal cache for index

pull/918/head
Raphaël Vinot 2024-05-22 00:38:35 +02:00
parent 5cba2a97e9
commit 172c54bae6
2 changed files with 8 additions and 4 deletions

View File

@ -28,7 +28,7 @@ from pyipasnhistory import IPASNHistory # type: ignore[attr-defined]
from redis import Redis
from .context import Context
from .helpers import get_captures_dir, is_locked
from .helpers import get_captures_dir, is_locked, make_ts_from_dirname
from .indexing import Indexing
from .default import LookylooException, try_make_file, get_config
from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
@ -260,11 +260,13 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
return None
p = self.redis.pipeline()
has_new_cached_captures = False
recent_captures = {}
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
if uuid in self.__cache:
continue
has_new_cached_captures = True
p.hgetall(directory)
recent_captures[uuid] = make_ts_from_dirname(directory.rsplit('/', 1)[-1]).timestamp()
if not has_new_cached_captures:
return
for cache in p.execute():
@ -276,6 +278,7 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
self.logger.warning(f'Unable to initialize the cache: {e}')
continue
self.__cache[cc.uuid] = cc
self.redis.zadd('recent_captures', recent_captures)
def _get_capture_dir(self, uuid: str) -> str:
# Try to get from the recent captures cache in redis
@ -285,6 +288,7 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
return capture_dir
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', uuid)
self.redis.zrem('recent_captures', uuid)
self.redis.delete(capture_dir)
# Try to get from the archived captures cache in redis

View File

@ -58,7 +58,7 @@ from .exceptions import (MissingCaptureDirectory,
from .helpers import (get_captures_dir, get_email_template,
get_resources_hashes, get_taxonomies,
uniq_domains, ParsedUserAgent, load_cookies, UserAgents,
get_useragent_for_requests, make_ts_from_dirname, load_takedown_filters
get_useragent_for_requests, load_takedown_filters
)
from .modules import (MISPs, PhishingInitiative, UniversalWhois,
UrlScan, VirusTotal, Phishtank, Hashlookup,
@ -510,8 +510,7 @@ class Lookyloo():
index_cut_time = cut_time
if capture_uuids is None:
capture_uuids = {uuid for uuid, directory in self.redis.hscan_iter('lookup_dirs')
if make_ts_from_dirname(directory.rsplit('/', 1)[-1]) > index_cut_time}
capture_uuids = self.redis.zrevrangebyscore('recent_captures', '+inf', index_cut_time.timestamp())
# NOTE: we absolutely have to respect the cached_captures_only setting and
# never overwrite it. This method is called to display the index
# and if we try to display everything, including the non-cached entries,
@ -1503,3 +1502,4 @@ class Lookyloo():
_fw.write(favicon)
self.redis.hset('lookup_dirs', uuid, str(dirpath))
self.redis.zadd('recent_captures', {uuid: now.timestamp()})