From 6e9e3990c4d5d96fff42cc738a4103ae24a6ce2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Fri, 24 Sep 2021 16:16:41 +0200
Subject: [PATCH] fix: Indexes not updated on tree rebuild, better handling of
 tree cache

---
 bin/stop.py              |  1 +
 lookyloo/capturecache.py | 15 +++++++++------
 lookyloo/indexing.py     | 39 +++++++++++++++++++++++++++++++++++++++
 website/web/__init__.py  |  5 ++++-
 4 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/bin/stop.py b/bin/stop.py
index 27db0441..9bf7c1fd 100755
--- a/bin/stop.py
+++ b/bin/stop.py
@@ -14,6 +14,7 @@ def main():
     p.wait()
     r = Redis(unix_socket_path=get_socket_path('cache'), db=1)
     r.delete('shutdown')
+    r = Redis(unix_socket_path=get_socket_path('cache'))
     r.delete('tree_cache')
     Popen(['run_backend', '--stop'])
 
diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py
index 8511354f..f44b6fd2 100644
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -19,6 +19,7 @@ from har2tree import CrawledTree, Har2TreeError, HarFile
 from redis import Redis
 
 from .context import Context
+from .indexing import Indexing
 from .exceptions import (LookylooException, MissingCaptureDirectory,
                          NoValidHarFile, MissingUUID, TreeNeedsRebuild)
 from .helpers import try_make_file, get_config
@@ -53,7 +54,7 @@ class CaptureCache():
 
     @property
     def tree(self) -> CrawledTree:
-        return load_pickle_tree(self.capture_dir)
+        return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime)
 
 
 def remove_pickle_tree(capture_dir: Path) -> None:
@@ -62,8 +63,8 @@ def remove_pickle_tree(capture_dir: Path) -> None:
         pickle_file.unlink()
 
 
-@lru_cache(maxsize=1024)
-def load_pickle_tree(capture_dir: Path) -> CrawledTree:
+@lru_cache(maxsize=256)
+def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree:
     pickle_file = capture_dir / 'tree.pickle'
     if pickle_file.exists():
         with pickle_file.open('rb') as _p:
@@ -89,6 +90,7 @@ class CapturesIndex(Mapping):
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(get_config('generic', 'loglevel'))
         self.redis = redis
+        self.indexing = Indexing()
         self.contextualizer = contextualizer
         self.__cache: Dict[str, CaptureCache] = {}
         self._quick_init()
@@ -110,9 +112,10 @@ class CapturesIndex(Mapping):
             self.__cache[uuid] = cc
             return self.__cache[uuid]
         try:
-            tree = load_pickle_tree(capture_dir)
+            tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
         except TreeNeedsRebuild:
             tree = self._create_pickle(capture_dir)
+            self.indexing.new_internal_uuids(tree)
         self.__cache[uuid] = self._set_capture_cache(capture_dir, tree)
         return self.__cache[uuid]
 
@@ -193,7 +196,7 @@ class CapturesIndex(Mapping):
             # The pickle is being created somewhere else, wait until it's done.
             while lock_file.exists():
                 time.sleep(5)
-            return load_pickle_tree(capture_dir)
+            return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
 
         har_files = sorted(capture_dir.glob('*.har'))
         pickle_file = capture_dir / 'tree.pickle'
@@ -251,7 +254,7 @@ class CapturesIndex(Mapping):
         if har.initial_redirects and har.need_tree_redirects:
             if not tree:
                 # try to load tree from disk
-                tree = load_pickle_tree(capture_dir)
+                tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
             # get redirects
             if tree:
                 cache['redirects'] = json.dumps(tree.redirects)
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index 954c8b62..7bda57da 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -27,6 +27,13 @@ class Indexing():
     def redis(self):
         return Redis(connection_pool=self.redis_pool)
 
+    def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:
+        # only trigger this method if the capture was already indexed.
+        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
+            self._reindex_cookies_capture(crawled_tree)
+        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
+            self._reindex_body_hashes_capture(crawled_tree)
+
     # ###### Cookies ######
 
     @property
@@ -45,6 +52,25 @@
     def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
         return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
 
+    def _reindex_cookies_capture(self, crawled_tree: CrawledTree) -> None:
+        pipeline = self.redis.pipeline()
+        already_loaded: Set[Tuple[str, str]] = set()
+        already_cleaned_up = []
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if hasattr(urlnode, 'cookies_received'):
+                for domain, cookie, _ in urlnode.cookies_received:
+                    name, value = cookie.split('=', 1)
+                    if (name, domain) in already_loaded:
+                        # Only add cookie name once / capture
+                        continue
+                    already_loaded.add((name, domain))
+                    if name not in already_cleaned_up:
+                        for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*'):
+                            pipeline.srem(f'cn|{name}|captures', key)
+                        already_cleaned_up.append(name)
+                    pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
+        pipeline.execute()
+
     def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
             # Do not reindex
@@ -107,6 +133,19 @@
             to_return['hash_domains_freq'] = int(hash_domains_freq)
         return to_return
 
+    def _reindex_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
+        # if the capture is regenerated, the hostnodes/urlnodes UUIDs are changed
+        cleaned_up_hashes = []
+        pipeline = self.redis.pipeline()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            for h in urlnode.resources_hashes:
+                if h not in cleaned_up_hashes:
+                    self.redis.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
+                    cleaned_up_hashes.append(h)
+                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
+                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
+        pipeline.execute()
+
     def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
             # Do not reindex
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 47d3910e..e4dc7ecf 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -220,7 +220,10 @@ def hostnode_popup(tree_uuid: str, node_uuid: str):
         'request_cookie': {'icon': "cookie_read.png", 'tooltip': 'There are cookies in the request'}
     }
 
-    hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
+    try:
+        hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
+    except IndexError:
+        return render_template('error.html', error_message='Sorry, this one is on us. The tree was rebuilt, please reload the tree and try again.')
 
     return render_template('hostname_popup.html',
                            tree_uuid=tree_uuid,
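
Two notes on the approach, with illustrative sketches (reviewer additions, not
part of the patch).

The cache fix hinges on making the capture directory's mtime part of the
lru_cache key: rebuilding a tree updates the directory's mtime, so the next
lookup misses the cache and loads the fresh pickle instead of a stale one. A
minimal self-contained sketch of that pattern, with illustrative names
(load_tree/_load_tree are not Lookyloo's API):

    from functools import lru_cache
    from pathlib import Path
    import pickle

    @lru_cache(maxsize=256)
    def _load_tree(capture_dir: Path, last_mod_time: float):
        # Re-executed only when (capture_dir, last_mod_time) is a new key.
        with (capture_dir / 'tree.pickle').open('rb') as _p:
            return pickle.load(_p)

    def load_tree(capture_dir: Path):
        # Callers always pass the current mtime, mirroring the patched
        # load_pickle_tree(capture_dir, capture_dir.stat().st_mtime) call sites.
        return _load_tree(capture_dir, capture_dir.stat().st_mtime)

Stale entries are not evicted eagerly; they simply stop being referenced and
age out of the LRU, which is presumably also why maxsize drops from 1024 to 256.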
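
The reindexing helpers are needed because index members embed node UUIDs
({capture_uuid}|{urlnode_uuid}) and har2tree regenerates every node UUID when a
tree is rebuilt, leaving old members pointing at nodes that no longer exist. A
stripped-down sketch of the purge-then-re-add pattern with redis-py; the
index_key name and the helper itself are hypothetical, and a reachable Redis
instance is assumed:

    from typing import Iterable

    from redis import Redis

    def reindex_capture(redis: Redis, index_key: str, capture_uuid: str,
                        node_uuids: Iterable[str]) -> None:
        # Same sscan_iter/srem + sadd sequence as _reindex_cookies_capture:
        # drop every member still carrying this capture's old node UUIDs,
        # then re-add members built from the freshly generated ones.
        pipeline = redis.pipeline()
        for stale in redis.sscan_iter(index_key, f'{capture_uuid}|*'):
            pipeline.srem(index_key, stale)
        for node_uuid in node_uuids:
            pipeline.sadd(index_key, f'{capture_uuid}|{node_uuid}')
        pipeline.execute()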