fix: Indexes not updated on tree rebuild, better handling of tree cache

pull/267/head
Raphaël Vinot 2021-09-24 16:16:41 +02:00
parent 48fc807e7d
commit 6e9e3990c4
4 changed files with 53 additions and 7 deletions

View File

@@ -14,6 +14,7 @@ def main():
     p.wait()
     r = Redis(unix_socket_path=get_socket_path('cache'), db=1)
     r.delete('shutdown')
+    r = Redis(unix_socket_path=get_socket_path('cache'))
     r.delete('tree_cache')
     Popen(['run_backend', '--stop'])
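
The added connection matters because the two keys do not live in the same keyspace: 'shutdown' sits in logical database 1 of the cache instance, while 'tree_cache' presumably lives in the default database 0, so deleting it through the db=1 connection never removed anything. A small standalone illustration of that behaviour, using a placeholder socket path rather than Lookyloo's real one:

    # Redis logical databases are independent keyspaces: a key written to db 0
    # is invisible to a client connected to db 1.
    from redis import Redis

    r_db0 = Redis(unix_socket_path='/tmp/cache.sock')        # default db 0
    r_db1 = Redis(unix_socket_path='/tmp/cache.sock', db=1)

    r_db0.set('tree_cache', 'anything')
    assert r_db1.delete('tree_cache') == 0   # wrong database: nothing removed
    assert r_db0.delete('tree_cache') == 1   # the key actually lives in db 0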

View File

@@ -19,6 +19,7 @@ from har2tree import CrawledTree, Har2TreeError, HarFile
 from redis import Redis
 
 from .context import Context
+from .indexing import Indexing
 from .exceptions import (LookylooException, MissingCaptureDirectory, NoValidHarFile,
                          MissingUUID, TreeNeedsRebuild)
 from .helpers import try_make_file, get_config
@@ -53,7 +54,7 @@ class CaptureCache():
     @property
     def tree(self) -> CrawledTree:
-        return load_pickle_tree(self.capture_dir)
+        return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime)
 
 
 def remove_pickle_tree(capture_dir: Path) -> None:
@@ -62,8 +63,8 @@ def remove_pickle_tree(capture_dir: Path) -> None:
         pickle_file.unlink()
 
 
-@lru_cache(maxsize=1024)
-def load_pickle_tree(capture_dir: Path) -> CrawledTree:
+@lru_cache(maxsize=256)
+def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree:
     pickle_file = capture_dir / 'tree.pickle'
     if pickle_file.exists():
         with pickle_file.open('rb') as _p:
@@ -89,6 +90,7 @@ class CapturesIndex(Mapping):
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(get_config('generic', 'loglevel'))
         self.redis = redis
+        self.indexing = Indexing()
         self.contextualizer = contextualizer
         self.__cache: Dict[str, CaptureCache] = {}
         self._quick_init()
@@ -110,9 +112,10 @@ class CapturesIndex(Mapping):
             self.__cache[uuid] = cc
             return self.__cache[uuid]
         try:
-            tree = load_pickle_tree(capture_dir)
+            tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
         except TreeNeedsRebuild:
             tree = self._create_pickle(capture_dir)
+            self.indexing.new_internal_uuids(tree)
         self.__cache[uuid] = self._set_capture_cache(capture_dir, tree)
         return self.__cache[uuid]
@@ -193,7 +196,7 @@ class CapturesIndex(Mapping):
             # The pickle is being created somewhere else, wait until it's done.
             while lock_file.exists():
                 time.sleep(5)
-            return load_pickle_tree(capture_dir)
+            return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
 
         har_files = sorted(capture_dir.glob('*.har'))
         pickle_file = capture_dir / 'tree.pickle'
@@ -251,7 +254,7 @@ class CapturesIndex(Mapping):
         if har.initial_redirects and har.need_tree_redirects:
             if not tree:
                 # try to load tree from disk
-                tree = load_pickle_tree(capture_dir)
+                tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
             # get redirects
             if tree:
                 cache['redirects'] = json.dumps(tree.redirects)
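
The second argument threaded through every load_pickle_tree() call is what keeps the lru_cache honest across rebuilds: the capture directory's modification time becomes part of the cache key, so once a tree is regenerated (and the directory touched) the stale cached entry is simply never hit again. A minimal, self-contained sketch of that pattern, with load_tree/get_tree as illustrative names rather than Lookyloo's API:

    import pickle
    from functools import lru_cache
    from pathlib import Path

    @lru_cache(maxsize=256)
    def load_tree(capture_dir: Path, last_mod_time: float):
        # last_mod_time is unused in the body: it only widens the cache key,
        # so a rebuilt (touched) directory causes a miss and a fresh read.
        with (capture_dir / 'tree.pickle').open('rb') as _p:
            return pickle.load(_p)

    def get_tree(capture_dir: Path):
        # Callers always pass the current mtime, exactly as the calls above do.
        return load_tree(capture_dir, capture_dir.stat().st_mtime)

The trade-off is that superseded entries linger in the cache until evicted, which is presumably why maxsize is lowered from 1024 to 256 in the same change.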

View File

@@ -27,6 +27,13 @@ class Indexing():
     def redis(self):
         return Redis(connection_pool=self.redis_pool)
 
+    def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:
+        # only trigger this method if the capture was already indexed.
+        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
+            self._reindex_cookies_capture(crawled_tree)
+        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
+            self._reindex_body_hashes_capture(crawled_tree)
+
     # ###### Cookies ######
 
     @property
@@ -45,6 +52,25 @@ class Indexing():
     def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
         return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
 
+    def _reindex_cookies_capture(self, crawled_tree: CrawledTree) -> None:
+        pipeline = self.redis.pipeline()
+        already_loaded: Set[Tuple[str, str]] = set()
+        already_cleaned_up = []
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if hasattr(urlnode, 'cookies_received'):
+                for domain, cookie, _ in urlnode.cookies_received:
+                    name, value = cookie.split('=', 1)
+                    if (name, domain) in already_loaded:
+                        # Only add cookie name once / capture
+                        continue
+                    already_loaded.add((name, domain))
+                    if name not in already_cleaned_up:
+                        for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*'):
+                            pipeline.srem(f'cn|{name}|captures', key)
+                        already_cleaned_up.append(name)
+                    pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
+        pipeline.execute()
+
     def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
             # Do not reindex
@@ -107,6 +133,19 @@ class Indexing():
             to_return['hash_domains_freq'] = int(hash_domains_freq)
         return to_return
 
+    def _reindex_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
+        # if the capture is regenerated, the hostnodes/urlnodes UUIDs are changed
+        cleaned_up_hashes = []
+        pipeline = self.redis.pipeline()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            for h in urlnode.resources_hashes:
+                if h not in cleaned_up_hashes:
+                    self.redis.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
+                    cleaned_up_hashes.append(h)
+                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
+                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
+        pipeline.execute()
+
     def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
             # Do not reindex
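
Both _reindex_* helpers exist because a rebuilt tree gets brand-new hostnode/urlnode UUIDs while the index entries still reference the old ones: members of cn|{name}|captures are shaped as capture_uuid|urlnode_uuid, and the body-hash index keeps one bh|{hash}|captures|{capture_uuid} sorted set per capture. A hedged redis-py sketch of the cookie-side cleanup, with a placeholder socket path and sample values:

    from redis import Redis

    r = Redis(unix_socket_path='/tmp/indexing.sock', decode_responses=True)
    capture_uuid = 'capture-1234'     # placeholder capture UUID
    cookie_name = 'session_id'        # placeholder cookie name

    pipeline = r.pipeline()
    # Members are 'capture_uuid|urlnode_uuid', so every entry belonging to this
    # capture (and its now-stale urlnode UUIDs) matches 'capture_uuid|*'.
    for member in r.sscan_iter(f'cn|{cookie_name}|captures', f'{capture_uuid}|*'):
        pipeline.srem(f'cn|{cookie_name}|captures', member)
    # Re-add the entry with the urlnode UUID taken from the rebuilt tree.
    pipeline.sadd(f'cn|{cookie_name}|captures', f'{capture_uuid}|rebuilt-urlnode-uuid')
    pipeline.execute()

The body-hash side takes the simpler route: since its key already embeds the capture UUID, the whole per-capture sorted set is deleted once per hash and then re-populated with the new urlnode/hostnode UUIDs.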

View File

@@ -220,7 +220,10 @@ def hostnode_popup(tree_uuid: str, node_uuid: str):
         'request_cookie': {'icon': "cookie_read.png", 'tooltip': 'There are cookies in the request'}
     }
 
-    hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
+    try:
+        hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
+    except IndexError:
+        return render_template('error.html', error_message='Sorry, this one is on us. The tree was rebuilt, please reload the tree and try again.')
 
     return render_template('hostname_popup.html',
                            tree_uuid=tree_uuid,