new: Archiver, refactoring.

pull/251/head
Raphaël Vinot 2021-08-20 17:46:22 +02:00
parent 6be9b69d95
commit 58b837cb6c
4 changed files with 123 additions and 65 deletions

bin/archiver.py Executable file

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from collections import defaultdict
import csv
from datetime import datetime, timedelta
import logging
from typing import Dict, List, Tuple
from pathlib import Path

from lookyloo.abstractmanager import AbstractManager
from lookyloo.lookyloo import Lookyloo
from lookyloo.helpers import get_config

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO, datefmt='%I:%M:%S')


class Archiver(AbstractManager):

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.script_name = 'archiver'

    def _to_run_forever(self):
        self._archive()

    def _archive(self):
        # Initialize the lookyloo class here, no need to keep it in memory all the time.
        lookyloo = Lookyloo()

        # make sure archived captures dir exists
        archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures'
        archived_captures_dir.mkdir(parents=True, exist_ok=True)

        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = datetime.now() - archive_interval

        # Format:
        # { 2020: { 12: [(directory, uuid)] } }
        to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
        for capture_path in lookyloo.capture_dir.glob('*'):
            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
            if timestamp >= cut_time:
                # do not archive.
                continue
            with (capture_path / 'uuid').open() as _f:
                uuid = _f.read().strip()
            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))

        archived_uuids = {}
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = archived_captures_dir / str(year) / str(month)
                dest_dir.mkdir(parents=True, exist_ok=True)
                if (dest_dir / 'index').exists():
                    with (dest_dir / 'index').open('r') as _f:
                        current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)}
                else:
                    current_index = {}
                for capture_path, uuid in captures:
                    current_index[uuid] = capture_path.name
                    capture_path.rename(dest_dir / capture_path.name)
                    archived_uuids[uuid] = str(dest_dir / capture_path.name)
                with (dest_dir / 'index').open('w') as _f:
                    index_writer = csv.writer(_f)
                    for uuid, dirname in current_index.items():
                        index_writer.writerow([uuid, dirname])

        if archived_uuids:
            lookyloo.redis.hdel('lookup_dirs', *archived_uuids.keys())
            lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
            lookyloo.clear_captures_index_cache(archived_uuids.keys())


def main():
    a = Archiver()
    a.run(sleep_in_sec=3600 * 24)


if __name__ == '__main__':
    main()

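Reviewer's note: the script above moves each stale capture into archived_captures/<year>/<month>/ and keeps one 'index' CSV per month, mapping capture UUIDs to directory names. A minimal sketch of reading those indexes back (an illustration only, not part of the commit; archive_root is a hypothetical path):

import csv
from pathlib import Path

archive_root = Path('archived_captures')  # assumed to sit next to capture_dir, as above

for index_file in archive_root.glob('*/*/index'):  # <year>/<month>/index
    with index_file.open() as f:
        for uuid, dirname in csv.reader(f):
            # Each row maps a capture UUID to that month's capture directory.
            print(uuid, '->', index_file.parent / dirname)
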
bin/background_indexer.py

@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
         self._check_indexes()

     def _build_missing_pickles(self):
-        for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'
@@ -45,15 +45,17 @@ class BackgroundIndexer(AbstractManager):
             with uuid_path.open() as f:
                 uuid = f.read()
             if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
+                # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                 self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
             try:
                 self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
                 self.lookyloo.get_crawled_tree(uuid)
                 self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                 self.logger.info(f'Pickle for {uuid} build.')
             except MissingUUID:
-                # The cache is not up-to-date, but the UUID definitely exists in the captures.
-                self.logger.warning(f'Unable to find {uuid}, re-triggering the cache.')
-                self.lookyloo._set_capture_cache(uuid_path.parent, force=True)
+                self.logger.warning(f'Unable to find {uuid}. That should not happen.')
             except NoValidHarFile:
                 self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
                 # The capture is not working, moving it away.

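Reviewer's note: capture directories are named after ISO 8601 timestamps ('%Y-%m-%dT%H:%M:%S.%f', as parsed in bin/archiver.py above), so lexicographic order is also chronological; sorting with reverse=True therefore makes the indexer build pickles for the newest captures first. A quick illustration:

# Directory names sort chronologically because of the timestamp naming scheme.
names = ['2021-08-19T10:00:00.000000', '2021-08-20T09:30:42.123456']
assert sorted(names, reverse=True)[0] == '2021-08-20T09:30:42.123456'
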
lookyloo/lookyloo.py

@@ -99,9 +99,6 @@ class Lookyloo():
         self.context = Context(self.sanejs)
         self._captures_index: Dict[str, CaptureCache] = {}

-        if not self.redis.exists('cache_loaded'):
-            self._init_existing_dumps()
-
     @property
     def redis(self):
         return Redis(connection_pool=self.redis_pool)
@@ -186,8 +183,6 @@ class Lookyloo():
         self._ensure_meta(capture_dir, ct)
         self._resolve_dns(ct)
         self.context.contextualize_tree(ct)
-        # Force update cache of the capture (takes care of the incomplete redirect key)
-        self._set_capture_cache(capture_dir, force=True)
         cache = self.capture_cache(capture_uuid)
         if not cache:
             raise LookylooException(f'Broken cache for {capture_dir}')
@@ -310,13 +305,14 @@ class Lookyloo():
         remove_pickle_tree(capture_dir)

     def rebuild_cache(self) -> None:
-        '''Flush and rebuild the redis cache. Doesn't remove the pickles.'''
+        '''Flush and rebuild the redis cache. Doesn't remove the pickles.
+        The cached captures will be rebuilt when loading the index.'''
         self.redis.flushdb()
-        self._init_existing_dumps()

     def rebuild_all(self) -> None:
-        '''Flush and rebuild the redis cache, and delede all the pickles.'''
-        [remove_pickle_tree(capture_dir) for capture_dir in self.capture_dirs]  # type: ignore
+        '''Flush and rebuild the redis cache, and delete all the pickles.
+        The captures will be rebuilt by the background indexer.'''
+        [remove_pickle_tree(capture_dir) for capture_dir in self.capture_dir.iterdir() if capture_dir.is_dir()]  # type: ignore
         self.rebuild_cache()

     def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
@@ -468,15 +464,9 @@ class Lookyloo():
             to_return[event_id].update(values)
         return to_return

-    def _set_capture_cache(self, capture_dir: Path, force: bool=False, redis_pipeline: Optional[Redis]=None) -> None:
-        '''Populate the redis cache for a capture. Mostly used on the index page.'''
-        # NOTE: this method is called in the background indexer as a fallback.
-        if force or not self.redis.exists(str(capture_dir)):
-            # (re)build cache
-            pass
-        else:
-            return
+    def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
+        '''Populate the redis cache for a capture. Mostly used on the index page.
+        NOTE: Doesn't require the pickle.'''
         with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
@@ -513,10 +503,7 @@ class Lookyloo():
         else:
             categories = []

-        if not redis_pipeline:
-            p = self.redis.pipeline()
-        else:
-            p = redis_pipeline
+        p = self.redis.pipeline()
         p.hset('lookup_dirs', uuid, str(capture_dir))
         if error_cache:
             if 'HTTP Error' not in error_cache['error']:
@@ -551,10 +538,10 @@ class Lookyloo():
                 cache['parent'] = f.read().strip()

         p.hmset(str(capture_dir), cache)
-        if not redis_pipeline:
-            p.execute()
+        p.execute()
+        # If the cache is re-created for some reason, pop from the local cache.
+        self._captures_index.pop(uuid, None)
+        return cache

     def hide_capture(self, capture_uuid: str, /) -> None:
         """Add the capture in the hidden pool (not shown on the front page)
@@ -580,7 +567,9 @@ class Lookyloo():
             # No captures at all on the instance
             return []

-        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if uuid in self._captures_index and not self._captures_index[uuid].incomplete_redirects]
+        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids
+                                         if (uuid in self._captures_index
+                                             and not self._captures_index[uuid].incomplete_redirects)]

         captures_to_get = set(capture_uuids) - set(self._captures_index.keys())
         if captures_to_get:
@@ -593,60 +582,47 @@ class Lookyloo():
                 if not c:
                     continue
                 c = CaptureCache(c)
-                if c.incomplete_redirects:
-                    self._set_capture_cache(c.capture_dir, force=True)
-                    c = self.capture_cache(c.uuid)
                 if hasattr(c, 'timestamp'):
                     all_cache.append(c)
                     self._captures_index[c.uuid] = c
         all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
         return all_cache

+    def clear_captures_index_cache(self, uuids: Iterable[str]) -> None:
+        [self._captures_index.pop(uuid) for uuid in uuids if uuid in self._captures_index]
+
     def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
-        """Get the cache from redis.
-        NOTE: Doesn't try to build the pickle"""
+        """Get the cache from redis."""
         if capture_uuid in self._captures_index:
+            if self._captures_index[capture_uuid].incomplete_redirects:
+                # Try to rebuild the cache
+                capture_dir = self._get_capture_dir(capture_uuid)
+                cached = self._set_capture_cache(capture_dir)
+                self._captures_index[capture_uuid] = CaptureCache(cached)
             return self._captures_index[capture_uuid]
         capture_dir = self._get_capture_dir(capture_uuid)
-        cached: Dict[str, Any] = self.redis.hgetall(str(capture_dir))
-        if not cached:
-            self.logger.warning(f'No cache available for {capture_dir}.')
+        if not capture_dir:
+            self.logger.warning(f'No directory for {capture_uuid}.')
             return None
+        cached = self.redis.hgetall(str(capture_dir))
+        if not cached:
+            cached = self._set_capture_cache(capture_dir)
         try:
-            cc = CaptureCache(cached)
-            self._captures_index[cc.uuid] = cc
-            return cc
+            self._captures_index[capture_uuid] = CaptureCache(cached)
+            return self._captures_index[capture_uuid]
         except LookylooException as e:
             self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
             return None

-    def _init_existing_dumps(self) -> None:
-        '''Initialize the cache for all the captures'''
-        p = self.redis.pipeline()
-        for capture_dir in self.capture_dirs:
-            if capture_dir.exists():
-                self._set_capture_cache(capture_dir, redis_pipeline=p)
-        p.set('cache_loaded', 1)
-        p.execute()
-
-    @property
-    def capture_dirs(self) -> List[Path]:
-        '''Get all the capture directories, sorder from newest to oldest.'''
-        for capture_dir in self.capture_dir.iterdir():
-            if capture_dir.is_dir() and not capture_dir.iterdir():
-                # Cleanup self.capture_dir of failed runs.
-                capture_dir.rmdir()
-            if not (capture_dir / 'uuid').exists():
-                # Create uuid if missing
-                with (capture_dir / 'uuid').open('w') as f:
-                    f.write(str(uuid4()))
-        return sorted(self.capture_dir.iterdir(), reverse=True)
-
     def _get_capture_dir(self, capture_uuid: str, /) -> Path:
         '''Use the cache to get a capture directory from a capture UUID'''
         capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)
         if not capture_dir:
-            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+            # Try in the archive
+            capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
         to_return = Path(capture_dir)
         if not to_return.exists():
             # The capture was removed, remove the UUID
@@ -970,8 +946,7 @@ class Lookyloo():
             cookies = item['cookies']
             with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                 json.dump(cookies, _cookies)
-        self._set_capture_cache(dirpath)
+        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
         return perma_uuid

     def get_body_hash_investigator(self, body_hash: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:

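Reviewer's note: taken together, the changes above remove the eager _init_existing_dumps() pass at startup; cache entries are now (re)built lazily by capture_cache(), and _get_capture_dir() falls back to the archive populated by bin/archiver.py. A condensed paraphrase of the new lookup order (simplified from the diff, not verbatim; any redis client exposing the same hashes works):

from pathlib import Path

def find_capture_dir(redis, capture_uuid: str) -> Path:
    # Live captures first, then the ones moved away by the archiver.
    capture_dir = redis.hget('lookup_dirs', capture_uuid)
    if not capture_dir:
        capture_dir = redis.hget('lookup_dirs_archived', capture_uuid)
    if not capture_dir:
        # The codebase raises MissingUUID here.
        raise KeyError(f'Unable to find UUID {capture_uuid} in the cache')
    return Path(capture_dir)
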
pyproject.toml

@@ -31,6 +31,7 @@ stop = "bin.stop:main"
 rebuild_caches = "bin.rebuild_caches:main"
 update = "bin.update:main"
 background_indexer = "bin.background_indexer:main"
+archiver = "bin.archiver:main"

 [tool.poetry.dependencies]
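
Reviewer's note: with this entry registered next to the existing console scripts, 'poetry install' should expose the daemon so it can be started with 'poetry run archiver'; per bin/archiver.py above, it then wakes up once a day (sleep_in_sec=3600 * 24) to move old captures away.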