mirror of https://github.com/CIRCL/lookyloo
chg: Use sorted cache whenever possible
parent 648d4d5b5b
commit 25c1a6dbce

@@ -129,6 +129,8 @@ class Lookyloo():
         ct = CrawledTree(har_files, capture_uuid)
         self._ensure_meta(capture_dir, ct)
         self._resolve_dns(ct)
+        # Force update cache of the capture (takes care of the incomplete redirect key)
+        self._set_capture_cache(capture_dir, force=True)
         # getting the cache triggers an update of the said cache. We want it there.
         cache = self.capture_cache(capture_uuid)
         if not cache:
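Note: these two added lines make the cache refresh eager: the entry is rewritten with force=True right after the tree is built, instead of lazily when it is read (see the capture_cache hunk below, which drops the lazy rebuild). A minimal stand-in sketch of that invariant, not the real classes:

    # Stand-ins only: cache_store mimics the per-capture redis hash.
    cache_store: dict = {}

    def set_capture_cache(capture_dir: str, force: bool = False) -> None:
        # Rewrite the entry when forced or missing, clearing the
        # 'incomplete_redirects' flag the diff comment refers to.
        if force or capture_dir not in cache_store:
            cache_store[capture_dir] = {'incomplete_redirects': '0'}

    set_capture_cache('/captures/example', force=True)  # eager, as in the hunk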
@@ -467,14 +469,15 @@ class Lookyloo():
         '''All the capture UUIDs present in the cache.'''
         return self.redis.hkeys('lookup_dirs')
 
-    @property
-    def sorted_cache(self) -> List[CaptureCache]:
+    def sorted_cache(self, capture_uuids: Iterable[str]=[]) -> List[CaptureCache]:
         '''Get all the captures in the cache, sorted by timestamp (new -> old).'''
         all_cache: List[CaptureCache] = []
         p = self.redis.pipeline()
-        capture_uuids = self.capture_uuids
         if not capture_uuids:
-            # No cached captures at all
+            # Sort all captures
+            capture_uuids = self.capture_uuids
+        if not capture_uuids:
+            # No captures at all on the instance
             return all_cache
         for directory in self.redis.hmget('lookup_dirs', *capture_uuids):
             if directory:
@@ -485,15 +488,13 @@ class Lookyloo():
                 c = CaptureCache(c)
                 if hasattr(c, 'timestamp'):
                     all_cache.append(c)
-        return sorted(all_cache, key=operator.attrgetter('timestamp'), reverse=True)
+        all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
+        return all_cache
 
     def capture_cache(self, capture_uuid: str) -> Optional[CaptureCache]:
         """Get the cache from redis.
         NOTE: Doesn't try to build the pickle"""
         capture_dir = self.lookup_capture_dir(capture_uuid)
-        if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
-            # try to rebuild the cache
-            self._set_capture_cache(capture_dir, force=True)
         cached: Dict[str, Any] = self.redis.hgetall(str(capture_dir))
         if not cached:
             self.logger.warning(f'No cache available for {capture_dir}.')
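Note: sorted_cache is no longer a @property; it is a method taking an optional iterable of capture UUIDs, and it now sorts in place and returns the same list. A standalone sketch of the new contract (FakeCache and STORE are stand-ins for CaptureCache and the 'lookup_dirs' redis hash):

    from dataclasses import dataclass
    from datetime import datetime
    import operator
    from typing import Dict, Iterable, List

    @dataclass
    class FakeCache:                      # stand-in for CaptureCache
        uuid: str
        timestamp: datetime

    STORE: Dict[str, FakeCache] = {}      # stand-in for 'lookup_dirs'

    def sorted_cache(capture_uuids: Iterable[str] = []) -> List[FakeCache]:
        uuids = list(capture_uuids) or list(STORE)   # no filter: sort everything
        entries = [STORE[u] for u in uuids if u in STORE]
        entries.sort(key=operator.attrgetter('timestamp'), reverse=True)  # new -> old
        return entries

    sorted_cache()               # all captures, newest first
    sorted_cache(['uuid-a'])     # only the given UUIDs, same ordering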
@@ -793,10 +794,8 @@ class Lookyloo():
         '''Returns all the captures related to a hash (sha512), used in the web interface.'''
         captures: List[Tuple[str, str]] = []
         total_captures, details = self.indexing.get_body_hash_captures(body_hash, limit=-1)
-        for capture_uuid, url_uuid, url_hostname, _ in details:
-            cache = self.capture_cache(capture_uuid)
-            if cache:
-                captures.append((capture_uuid, cache.title))
+        cached_captures = self.sorted_cache([d[0] for d in details])
+        captures = [(cache.uuid, cache.title) for cache in cached_captures]
         domains = self.indexing.get_body_hash_domains(body_hash)
         return captures, domains
 
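Note: the change above replaces one capture_cache() lookup per UUID with a single batched sorted_cache() call. Internally that relies on a redis pipeline; a hedged sketch of the batching idea (key names mirror the diff, the client setup and UUIDs are illustrative):

    import redis

    r = redis.Redis(decode_responses=True)
    uuids = ['uuid-a', 'uuid-b']                # hypothetical capture UUIDs
    dirs = r.hmget('lookup_dirs', *uuids)       # uuid -> capture directory
    p = r.pipeline()
    for d in dirs:
        if d:
            p.hgetall(d)                        # queued, not yet executed
    entries = [e for e in p.execute() if e]     # one round trip for all entries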
@@ -824,14 +823,7 @@ class Lookyloo():
 
     def get_url_occurrences(self, url: str, limit: int=20) -> List[Dict]:
         '''Get the most recent captures and URL nodes where the URL has been seen.'''
-        captures: List[CaptureCache] = []
-        for uuid in self.indexing.get_captures_url(url):
-            c = self.capture_cache(uuid)
-            if not c:
-                continue
-            if hasattr(c, 'timestamp'):
-                captures.append(c)
-        captures.sort(key=operator.attrgetter('timestamp'), reverse=True)
+        captures = self.sorted_cache(self.indexing.get_captures_url(url))
 
         to_return: List[Dict] = []
         for capture in captures[:limit]:
@@ -851,14 +843,7 @@ class Lookyloo():
 
     def get_hostname_occurrences(self, hostname: str, with_urls_occurrences: bool=False, limit: int=20) -> List[Dict]:
         '''Get the most recent captures and URL nodes where the hostname has been seen.'''
-        captures: List[CaptureCache] = []
-        for uuid in self.indexing.get_captures_hostname(hostname):
-            c = self.capture_cache(uuid)
-            if not c:
-                continue
-            if hasattr(c, 'timestamp'):
-                captures.append(c)
-        captures.sort(key=operator.attrgetter('timestamp'), reverse=True)
+        captures = self.sorted_cache(self.indexing.get_captures_hostname(hostname))
 
         to_return: List[Dict] = []
         for capture in captures[:limit]:
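Note: both occurrence helpers now share the same shape: feed the index lookup into sorted_cache() and slice, since the result is already newest-first. A stand-in sketch of that shape (index_lookup and sorted_cache are placeholders for the real methods):

    from typing import Callable, List

    def most_recent(index_lookup: Callable[[str], List[str]],
                    sorted_cache: Callable[[List[str]], list],
                    key: str, limit: int = 20) -> list:
        captures = sorted_cache(index_lookup(key))  # filter + sort in one call
        return captures[:limit]                     # newest-first, so slicing suffices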
@@ -886,11 +871,8 @@ class Lookyloo():
 
     def get_cookie_name_investigator(self, cookie_name: str):
         '''Returns all the captures related to a cookie name entry, used in the web interface.'''
-        captures = []
-        for capture_uuid, url_uuid in self.indexing.get_cookies_names_captures(cookie_name):
-            cache = self.capture_cache(capture_uuid)
-            if cache:
-                captures.append((capture_uuid, cache.title))
+        cached_captures = self.sorted_cache([entry[0] for entry in self.indexing.get_cookies_names_captures(cookie_name)])
+        captures = [(cache.uuid, cache.title) for cache in cached_captures]
         domains = [(domain, freq, self.indexing.cookies_names_domains_values(cookie_name, domain))
                    for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
         return captures, domains
@@ -1158,14 +1140,7 @@ class Lookyloo():
         stats: Dict[int, Dict[int, Dict[str, Any]]] = {}
         weeks_stats: Dict[int, Dict] = {}
 
-        for uuid in self.capture_uuids:
-            # What we get here is in a random order. This look sorts the captures
-            cache = self.capture_cache(uuid)
-            if not cache:
-                # That shouldn't happen, a warning went in the logs.
-                continue
-            if not hasattr(cache, 'timestamp'):
-                continue
+        for cache in self.sorted_cache():
             date_submission: datetime = cache.timestamp
 
             if date_submission.year not in stats:
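Note: the stats loop no longer has to sort or sanity-check anything: sorted_cache() already drops entries without a timestamp and yields them newest-first. A small sketch of the grouping the loop feeds (the 'submissions' counter is an assumed field; the nested layout mirrors the type annotations):

    from datetime import datetime
    from typing import Any, Dict

    stats: Dict[int, Dict[int, Dict[str, Any]]] = {}
    ts = datetime(2021, 3, 2)                        # stand-in for cache.timestamp
    year, week = ts.year, ts.isocalendar()[1]
    stats.setdefault(year, {}).setdefault(week, {'submissions': 0})
    stats[year][week]['submissions'] += 1            # one capture counted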
@@ -511,9 +511,7 @@ def index_generic(show_hidden: bool=False, category: Optional[str]=None):
     else:
         cut_time = None  # type: ignore
 
-    for cached in lookyloo.sorted_cache:
-        if not cached:
-            continue
+    for cached in lookyloo.sorted_cache():
         if cut_time and cached.timestamp < cut_time:
             continue
         if category:
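Note: the web index now calls sorted_cache() as a method and can drop the per-entry None check, since the method only returns valid entries. A sketch of the cut_time filter it keeps, reusing the sorted_cache stand-in from the earlier sketch (the cut-off value here is illustrative; the real one is computed earlier in index_generic):

    from datetime import datetime, timedelta

    cut_time = datetime.now() - timedelta(days=1)   # stand-in for the real cut-off
    for cached in sorted_cache():                   # newest first
        if cut_time and cached.timestamp < cut_time:
            continue                                # too old for the index page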