chg: Improve initial caching.

pull/116/head
Raphaël Vinot 2020-10-29 23:25:20 +01:00
parent 69f65c9a87
commit 8b1e3585ea
2 changed files with 35 additions and 30 deletions

View File

@ -36,7 +36,7 @@ def main():
if cache.get('no_index') is not None: if cache.get('no_index') is not None:
index = False index = False
# NOTE: these methods do nothing if we just generated the pickle # NOTE: these methods do nothing if we just generated the pickle when calling lookyloo.get_crawled_tree
if index: if index:
indexing.index_cookies_capture(tree) indexing.index_cookies_capture(tree)
indexing.index_body_hashes_capture(tree) indexing.index_body_hashes_capture(tree)

View File

@ -367,7 +367,7 @@ class Lookyloo():
to_return['pi'][ct.root_hartree.har.root_url] = self.pi.get_url_lookup(ct.root_hartree.har.root_url) to_return['pi'][ct.root_hartree.har.root_url] = self.pi.get_url_lookup(ct.root_hartree.har.root_url)
return to_return return to_return
def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None: def _set_capture_cache(self, capture_dir: Path, force: bool=False, redis_pipeline: Optional[Redis]=None) -> None:
if force or not self.redis.exists(str(capture_dir)): if force or not self.redis.exists(str(capture_dir)):
# (re)build cache # (re)build cache
pass pass
@ -404,37 +404,40 @@ class Lookyloo():
error_cache['error'] = f'No har files in {capture_dir.name}' error_cache['error'] = f'No har files in {capture_dir.name}'
fatal_error = True fatal_error = True
if not redis_pipeline:
p = self.redis.pipeline()
else:
p = redis_pipeline
p.hset('lookup_dirs', uuid, str(capture_dir))
if error_cache: if error_cache:
self.logger.warning(error_cache['error']) self.logger.warning(error_cache['error'])
self.redis.hmset(str(capture_dir), error_cache) # type: ignore p.hmset(str(capture_dir), error_cache)
self.redis.hset('lookup_dirs', uuid, str(capture_dir))
if fatal_error: if not fatal_error:
return redirects = har.initial_redirects
incomplete_redirects = False
if redirects and har.need_tree_redirects:
# load tree from disk, get redirects
ct = load_pickle_tree(capture_dir)
if ct:
redirects = ct.redirects
else:
# Pickle not available
incomplete_redirects = True
redirects = har.initial_redirects cache: Dict[str, Union[str, int]] = {'uuid': uuid,
incomplete_redirects = False 'title': har.initial_title,
if redirects and har.need_tree_redirects: 'timestamp': har.initial_start_time,
# load tree from disk, get redirects 'url': har.root_url,
ct = load_pickle_tree(capture_dir) 'redirects': json.dumps(redirects),
if ct: 'capture_dir': str(capture_dir),
redirects = ct.redirects 'incomplete_redirects': 1 if incomplete_redirects else 0}
else: if (capture_dir / 'no_index').exists(): # If the folder claims anonymity
# Pickle not available cache['no_index'] = 1
incomplete_redirects = True
cache: Dict[str, Union[str, int]] = {'uuid': uuid, p.hmset(str(capture_dir), cache)
'title': har.initial_title, if not redis_pipeline:
'timestamp': har.initial_start_time, p.execute()
'url': har.root_url,
'redirects': json.dumps(redirects),
'capture_dir': str(capture_dir),
'incomplete_redirects': 1 if incomplete_redirects else 0}
if (capture_dir / 'no_index').exists(): # If the folder claims anonymity
cache['no_index'] = 1
self.redis.hmset(str(capture_dir), cache) # type: ignore
self.redis.hset('lookup_dirs', uuid, str(capture_dir))
def hide_capture(self, capture_uuid: str) -> None: def hide_capture(self, capture_uuid: str) -> None:
"""Add the capture in the hidden pool (not shown on the front page) """Add the capture in the hidden pool (not shown on the front page)
@ -493,10 +496,12 @@ class Lookyloo():
return {} return {}
def _init_existing_dumps(self) -> None: def _init_existing_dumps(self) -> None:
p = self.redis.pipeline()
for capture_dir in self.capture_dirs: for capture_dir in self.capture_dirs:
if capture_dir.exists(): if capture_dir.exists():
self._set_capture_cache(capture_dir) self._set_capture_cache(capture_dir, redis_pipeline=p)
self.redis.set('cache_loaded', 1) p.set('cache_loaded', 1)
p.execute()
@property @property
def capture_dirs(self) -> List[Path]: def capture_dirs(self) -> List[Path]: