fix: Indexes not updated on tree rebuild, better handling of tree cache

pull/267/head
Raphaël Vinot 2021-09-24 16:16:41 +02:00
parent 48fc807e7d
commit 6e9e3990c4
4 changed files with 53 additions and 7 deletions

@@ -14,6 +14,7 @@ def main():
p.wait()
r = Redis(unix_socket_path=get_socket_path('cache'), db=1)
r.delete('shutdown')
r = Redis(unix_socket_path=get_socket_path('cache'))
r.delete('tree_cache')
Popen(['run_backend', '--stop'])

@@ -19,6 +19,7 @@ from har2tree import CrawledTree, Har2TreeError, HarFile
from redis import Redis
from .context import Context
from .indexing import Indexing
from .exceptions import (LookylooException, MissingCaptureDirectory, NoValidHarFile,
MissingUUID, TreeNeedsRebuild)
from .helpers import try_make_file, get_config
@@ -53,7 +54,7 @@ class CaptureCache():
@property
def tree(self) -> CrawledTree:
return load_pickle_tree(self.capture_dir)
return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime)
def remove_pickle_tree(capture_dir: Path) -> None:
@@ -62,8 +63,8 @@ def remove_pickle_tree(capture_dir: Path) -> None:
pickle_file.unlink()
@lru_cache(maxsize=1024)
def load_pickle_tree(capture_dir: Path) -> CrawledTree:
@lru_cache(maxsize=256)
def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree:
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
with pickle_file.open('rb') as _p:
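
For illustration, a minimal standalone sketch of the caching pattern above (the function and variable names here are hypothetical, not part of this diff): because the directory's mtime is part of the lru_cache key, a rebuilt tree.pickle changes the key and forces a reload instead of serving the stale cached tree.

import pickle
from functools import lru_cache
from pathlib import Path

@lru_cache(maxsize=256)
def _load_cached_tree(capture_dir: Path, last_mod_time: float):
    # last_mod_time is only used as part of the cache key; any change to the
    # capture directory (e.g. a rebuilt tree.pickle) produces a cache miss.
    with (capture_dir / 'tree.pickle').open('rb') as _p:
        return pickle.load(_p)

# Callers always pass the current mtime:
# tree = _load_cached_tree(capture_dir, capture_dir.stat().st_mtime)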
@@ -89,6 +90,7 @@ class CapturesIndex(Mapping):
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(get_config('generic', 'loglevel'))
self.redis = redis
self.indexing = Indexing()
self.contextualizer = contextualizer
self.__cache: Dict[str, CaptureCache] = {}
self._quick_init()
@@ -110,9 +112,10 @@ class CapturesIndex(Mapping):
self.__cache[uuid] = cc
return self.__cache[uuid]
try:
tree = load_pickle_tree(capture_dir)
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
except TreeNeedsRebuild:
tree = self._create_pickle(capture_dir)
self.indexing.new_internal_uuids(tree)
self.__cache[uuid] = self._set_capture_cache(capture_dir, tree)
return self.__cache[uuid]
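
The recovery path above, reduced to a self-contained toy to show the shape of the pattern (the exception and the callables are stand-ins for the real TreeNeedsRebuild, _create_pickle and Indexing.new_internal_uuids):

class NeedsRebuild(Exception):
    """Stand-in for TreeNeedsRebuild."""

def get_tree(capture_dir, load, rebuild, reindex):
    # Try the cached/pickled tree first; if it cannot be loaded, rebuild it
    # and refresh anything keyed on the old internal node UUIDs.
    try:
        return load(capture_dir)
    except NeedsRebuild:
        tree = rebuild(capture_dir)
        reindex(tree)  # e.g. Indexing().new_internal_uuids(tree)
        return tree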
@@ -193,7 +196,7 @@ class CapturesIndex(Mapping):
# The pickle is being created somewhere else, wait until it's done.
while lock_file.exists():
time.sleep(5)
return load_pickle_tree(capture_dir)
return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
har_files = sorted(capture_dir.glob('*.har'))
pickle_file = capture_dir / 'tree.pickle'
@@ -251,7 +254,7 @@ class CapturesIndex(Mapping):
if har.initial_redirects and har.need_tree_redirects:
if not tree:
# try to load tree from disk
tree = load_pickle_tree(capture_dir)
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
# get redirects
if tree:
cache['redirects'] = json.dumps(tree.redirects)

@@ -27,6 +27,13 @@ class Indexing():
def redis(self):
return Redis(connection_pool=self.redis_pool)
def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:
# only trigger this method if the capture was already indexed.
if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
self._reindex_cookies_capture(crawled_tree)
if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
self._reindex_body_hashes_capture(crawled_tree)
# ###### Cookies ######
@property
@@ -45,6 +52,25 @@ class Indexing():
def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
def _reindex_cookies_capture(self, crawled_tree: CrawledTree) -> None:
pipeline = self.redis.pipeline()
already_loaded: Set[Tuple[str, str]] = set()
already_cleaned_up = []
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
if hasattr(urlnode, 'cookies_received'):
for domain, cookie, _ in urlnode.cookies_received:
name, value = cookie.split('=', 1)
if (name, domain) in already_loaded:
# Only add cookie name once / capture
continue
already_loaded.add((name, domain))
if name not in already_cleaned_up:
for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*'):
pipeline.srem(f'cn|{name}|captures', key)
already_cleaned_up.append(name)
pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
pipeline.execute()
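
To make the key layout concrete, a hedged read-side sketch (the Redis connection setup is an assumption): each cn|{cookie_name}|captures set holds members of the form '<capture_uuid>|<urlnode_uuid>', which is why a rebuilt capture first drops its old members (matched with '{capture_uuid}|*') before the new urlnode UUIDs are added.

from redis import Redis

def captures_with_cookie(redis: Redis, cookie_name: str):
    # Mirrors get_cookies_names_captures above; assumes the client was created
    # with decode_responses=True so members come back as str.
    for member in redis.smembers(f'cn|{cookie_name}|captures'):
        capture_uuid, urlnode_uuid = member.split('|', 1)
        yield capture_uuid, urlnode_uuid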
def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
# Do not reindex
@@ -107,6 +133,19 @@ class Indexing():
to_return['hash_domains_freq'] = int(hash_domains_freq)
return to_return
def _reindex_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
# if the capture is regenerated, the hostnodes/urlnodes UUIDs are changed
cleaned_up_hashes = []
pipeline = self.redis.pipeline()
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
for h in urlnode.resources_hashes:
if h not in cleaned_up_hashes:
self.redis.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
cleaned_up_hashes.append(h)
pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
pipeline.execute()
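
Similarly for the body-hash index, a hedged read-side sketch (connection setup assumed): bh|{h}|captures|{capture_uuid} is a sorted set whose members encode '<urlnode_uuid>|<hostnode_uuid>|<url name>', scored by how often the resource was seen, so the simplest way to handle rebuilt node UUIDs is to delete and repopulate the whole per-capture key.

from redis import Redis

def urlnodes_with_body_hash(redis: Redis, body_hash: str, capture_uuid: str):
    # Assumes a client created with decode_responses=True so members are str.
    key = f'bh|{body_hash}|captures|{capture_uuid}'
    for member, freq in redis.zrevrange(key, 0, -1, withscores=True):
        urlnode_uuid, hostnode_uuid, name = member.split('|', 2)
        yield urlnode_uuid, hostnode_uuid, name, int(freq)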
def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
# Do not reindex

@@ -220,7 +220,10 @@ def hostnode_popup(tree_uuid: str, node_uuid: str):
'request_cookie': {'icon': "cookie_read.png", 'tooltip': 'There are cookies in the request'}
}
hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
try:
hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
except IndexError:
return render_template('error.html', error_message='Sorry, this one is on us. The tree was rebuilt, please reload the tree and try again.')
return render_template('hostname_popup.html',
tree_uuid=tree_uuid,