mirror of https://github.com/CIRCL/lookyloo

fix: Indexes not updated on tree rebuild, better handling of tree cache

parent 48fc807e7d
commit 6e9e3990c4
bin/stop.py

```diff
@@ -14,6 +14,8 @@ def main():
     p.wait()
     r = Redis(unix_socket_path=get_socket_path('cache'), db=1)
     r.delete('shutdown')
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    r.delete('tree_cache')
     Popen(['run_backend', '--stop'])
```
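A note on the hunk above: redis-py pins each client to a single logical database, so clearing the shutdown flag (stored in db 1) and the tree cache key (stored in the default db 0) takes two `Redis` instances even though both sit behind the same unix socket. A minimal sketch of that detail; the socket path is illustrative and stands in for Lookyloo's `get_socket_path('cache')`:

```python
from redis import Redis

# Illustrative socket path; Lookyloo resolves it via get_socket_path('cache').
SOCKET = '/path/to/cache.sock'

r_db1 = Redis(unix_socket_path=SOCKET, db=1)
r_db1.delete('shutdown')    # the shutdown flag lives in logical db 1

r_db0 = Redis(unix_socket_path=SOCKET)  # db=0 is redis-py's default
r_db0.delete('tree_cache')  # the tree cache key lives in the default db
```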
lookyloo/capturecache.py

```diff
@@ -19,6 +19,7 @@ from har2tree import CrawledTree, Har2TreeError, HarFile
 from redis import Redis
 
 from .context import Context
+from .indexing import Indexing
 from .exceptions import (LookylooException, MissingCaptureDirectory, NoValidHarFile,
                          MissingUUID, TreeNeedsRebuild)
 from .helpers import try_make_file, get_config
```
```diff
@@ -53,7 +54,7 @@ class CaptureCache():
 
     @property
     def tree(self) -> CrawledTree:
-        return load_pickle_tree(self.capture_dir)
+        return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime)
 
 
 def remove_pickle_tree(capture_dir: Path) -> None:
```
```diff
@@ -62,8 +63,8 @@ def remove_pickle_tree(capture_dir: Path) -> None:
         pickle_file.unlink()
 
 
-@lru_cache(maxsize=1024)
-def load_pickle_tree(capture_dir: Path) -> CrawledTree:
+@lru_cache(maxsize=256)
+def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree:
     pickle_file = capture_dir / 'tree.pickle'
     if pickle_file.exists():
         with pickle_file.open('rb') as _p:
```
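The new `last_mod_time` parameter is the heart of the cache fix. `functools.lru_cache` keys on the whole argument tuple, so passing the capture directory's mtime turns it into a cache-buster: rebuilding a tree writes a new pickle into the directory, the directory's mtime changes, and the next lookup misses the stale entry instead of serving the old tree. (`st_mtime` is actually a float despite the `int` annotation in the hunk; any hashable value works.) A self-contained sketch of the pattern, with illustrative names rather than Lookyloo's:

```python
import pickle
from functools import lru_cache
from pathlib import Path

@lru_cache(maxsize=256)
def load_cached(directory: Path, last_mod_time: float) -> object:
    # last_mod_time is never read in the body: it only widens the cache key,
    # so a directory whose mtime changed no longer hits the stale entry.
    with (directory / 'data.pickle').open('rb') as f:
        return pickle.load(f)

# Callers always pass the current mtime. Writing a new pickle into the
# directory updates the directory's mtime, which invalidates the old entry:
# tree = load_cached(capture_dir, capture_dir.stat().st_mtime)
```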
```diff
@@ -89,6 +90,7 @@ class CapturesIndex(Mapping):
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(get_config('generic', 'loglevel'))
         self.redis = redis
+        self.indexing = Indexing()
         self.contextualizer = contextualizer
         self.__cache: Dict[str, CaptureCache] = {}
         self._quick_init()
```
```diff
@@ -110,9 +112,10 @@ class CapturesIndex(Mapping):
                 self.__cache[uuid] = cc
                 return self.__cache[uuid]
         try:
-            tree = load_pickle_tree(capture_dir)
+            tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
         except TreeNeedsRebuild:
             tree = self._create_pickle(capture_dir)
+            self.indexing.new_internal_uuids(tree)
         self.__cache[uuid] = self._set_capture_cache(capture_dir, tree)
         return self.__cache[uuid]
```
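The added `new_internal_uuids` call is the other half of the commit message. As the comment in the indexing hunk below puts it, hostnode/urlnode UUIDs change when a capture is regenerated: they are minted fresh on every tree build, so any Redis index entry recording a `capture_uuid|urlnode_uuid` pair dangles after a rebuild. A toy illustration (not har2tree's actual code) of why two builds never share node UUIDs:

```python
from uuid import uuid4

class Node:
    """Stand-in for a har2tree URL node: it gets a fresh UUID on every build."""
    def __init__(self, name: str) -> None:
        self.uuid = str(uuid4())
        self.name = name

first_build = Node('example.com')
second_build = Node('example.com')            # same page, tree rebuilt
print(first_build.uuid == second_build.uuid)  # False -> old index entries are stale
```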
```diff
@@ -193,7 +196,7 @@ class CapturesIndex(Mapping):
             # The pickle is being created somewhere else, wait until it's done.
             while lock_file.exists():
                 time.sleep(5)
-            return load_pickle_tree(capture_dir)
+            return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
 
         har_files = sorted(capture_dir.glob('*.har'))
         pickle_file = capture_dir / 'tree.pickle'
```
```diff
@@ -251,7 +254,7 @@ class CapturesIndex(Mapping):
         if har.initial_redirects and har.need_tree_redirects:
             if not tree:
                 # try to load tree from disk
-                tree = load_pickle_tree(capture_dir)
+                tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
             # get redirects
             if tree:
                 cache['redirects'] = json.dumps(tree.redirects)
```
lookyloo/indexing.py

```diff
@@ -27,6 +27,13 @@ class Indexing():
     def redis(self):
         return Redis(connection_pool=self.redis_pool)
 
+    def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:
+        # only trigger this method if the capture was already indexed.
+        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
+            self._reindex_cookies_capture(crawled_tree)
+        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
+            self._reindex_body_hashes_capture(crawled_tree)
+
     # ###### Cookies ######
 
     @property
```
```diff
@@ -45,6 +52,25 @@ class Indexing():
     def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
         return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
 
+    def _reindex_cookies_capture(self, crawled_tree: CrawledTree) -> None:
+        pipeline = self.redis.pipeline()
+        already_loaded: Set[Tuple[str, str]] = set()
+        already_cleaned_up = []
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if hasattr(urlnode, 'cookies_received'):
+                for domain, cookie, _ in urlnode.cookies_received:
+                    name, value = cookie.split('=', 1)
+                    if (name, domain) in already_loaded:
+                        # Only add cookie name once / capture
+                        continue
+                    already_loaded.add((name, domain))
+                    if name not in already_cleaned_up:
+                        for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*'):
+                            pipeline.srem(f'cn|{name}|captures', key)
+                        already_cleaned_up.append(name)
+                    pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
+        pipeline.execute()
+
     def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
             # Do not reindex
```
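The cleanup in `_reindex_cookies_capture` leans on the member layout `<capture_uuid>|<urlnode_uuid>`: an SSCAN with the match glob `<capture_uuid>|*` yields exactly the members belonging to one capture, which are dropped and then re-added with the rebuilt tree's UUIDs. A standalone sketch of that remove-then-readd pattern; the key layout follows the diff, the helper name is hypothetical:

```python
from typing import List
from redis import Redis

def refresh_capture_members(r: Redis, key: str, capture_uuid: str,
                            new_node_uuids: List[str]) -> None:
    pipe = r.pipeline()
    # SSCAN must run on the live client (a pipeline only buffers commands);
    # the MATCH glob isolates this capture's members.
    for member in r.sscan_iter(key, f'{capture_uuid}|*'):
        pipe.srem(key, member)
    # Re-add the members with the post-rebuild node UUIDs.
    for node_uuid in new_node_uuids:
        pipe.sadd(key, f'{capture_uuid}|{node_uuid}')
    pipe.execute()

# e.g. refresh_capture_members(r, 'cn|session_id|captures', capture_uuid, uuids)
```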
```diff
@@ -107,6 +133,19 @@ class Indexing():
         to_return['hash_domains_freq'] = int(hash_domains_freq)
         return to_return
 
+    def _reindex_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
+        # if the capture is regenerated, the hostnodes/urlnodes UUIDs are changed
+        cleaned_up_hashes = []
+        pipeline = self.redis.pipeline()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            for h in urlnode.resources_hashes:
+                if h not in cleaned_up_hashes:
+                    self.redis.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
+                    cleaned_up_hashes.append(h)
+                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
+                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
+        pipeline.execute()
+
     def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
             # Do not reindex
```
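`_reindex_body_hashes_capture` can be blunter than the cookie variant because the capture UUID is part of the key itself (`bh|<hash>|captures|<capture_uuid>`): the stale sorted set is deleted wholesale with one DEL rather than scanned member by member, then rebuilt via ZINCRBY. A sketch under the same assumptions (key layout from the diff, helper name hypothetical):

```python
from typing import List, Tuple
from redis import Redis

def rebuild_hash_entries(r: Redis, h: str, capture_uuid: str,
                         nodes: List[Tuple[str, str, str]]) -> None:
    key = f'bh|{h}|captures|{capture_uuid}'
    r.delete(key)  # one DEL wipes every entry pointing at pre-rebuild UUIDs
    pipe = r.pipeline()
    for urlnode_uuid, hostnode_uuid, url in nodes:
        # the score counts how often this node loaded a resource with hash h
        pipe.zincrby(key, 1, f'{urlnode_uuid}|{hostnode_uuid}|{url}')
    pipe.execute()
```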
website/web/__init__.py

```diff
@@ -220,7 +220,10 @@ def hostnode_popup(tree_uuid: str, node_uuid: str):
         'request_cookie': {'icon': "cookie_read.png", 'tooltip': 'There are cookies in the request'}
     }
 
-    hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
+    try:
+        hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
+    except IndexError:
+        return render_template('error.html', error_message='Sorry, this one is on us. The tree was rebuilt, please reload the tree and try again.')
 
     return render_template('hostname_popup.html',
                            tree_uuid=tree_uuid,
```