From 58b837cb6cb7bf1dc42646770bddcd899bdc6afd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Fri, 20 Aug 2021 17:46:22 +0200
Subject: [PATCH] new: Archiver, refactoring.

---
 bin/archiver.py           | 80 ++++++++++++++++++++++++++++++++
 bin/background_indexer.py | 10 ++--
 lookyloo/lookyloo.py      | 97 +++++++++++++++------------------------
 pyproject.toml            |  1 +
 4 files changed, 123 insertions(+), 65 deletions(-)
 create mode 100755 bin/archiver.py

diff --git a/bin/archiver.py b/bin/archiver.py
new file mode 100755
index 0000000..2427e0b
--- /dev/null
+++ b/bin/archiver.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from collections import defaultdict
+import csv
+from datetime import datetime, timedelta
+import logging
+from typing import Dict, List, Tuple
+from pathlib import Path
+
+from lookyloo.abstractmanager import AbstractManager
+from lookyloo.lookyloo import Lookyloo
+from lookyloo.helpers import get_config
+
+logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
+                    level=logging.INFO, datefmt='%I:%M:%S')
+
+
+class Archiver(AbstractManager):
+
+    def __init__(self, loglevel: int=logging.INFO):
+        super().__init__(loglevel)
+        self.script_name = 'archiver'
+
+    def _to_run_forever(self):
+        self._archive()
+
+    def _archive(self):
+        # Initialize the lookyloo class here, no need to keep it in memory all the time.
+        lookyloo = Lookyloo()
+        # make sure archived captures dir exists
+        archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures'
+        archived_captures_dir.mkdir(parents=True, exist_ok=True)
+        archive_interval = timedelta(days=get_config('generic', 'archive'))
+        cut_time = datetime.now() - archive_interval
+
+        # Format:
+        # { 2020: { 12: [(directory, uuid)] } }
+        to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
+        for capture_path in lookyloo.capture_dir.glob('*'):
+            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
+            if timestamp >= cut_time:
+                # do not archive.
+                continue
+            with (capture_path / 'uuid').open() as _f:
+                uuid = _f.read().strip()
+            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
+
+        archived_uuids = {}
+        for year, month_captures in to_archive.items():
+            for month, captures in month_captures.items():
+                dest_dir = archived_captures_dir / str(year) / str(month)
+                dest_dir.mkdir(parents=True, exist_ok=True)
+                if (dest_dir / 'index').exists():
+                    with (dest_dir / 'index').open('r') as _f:
+                        current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)}
+                else:
+                    current_index = {}
+                for capture_path, uuid in captures:
+                    current_index[uuid] = capture_path.name
+                    capture_path.rename(dest_dir / capture_path.name)
+                    archived_uuids[uuid] = str(dest_dir / capture_path.name)
+                with (dest_dir / 'index').open('w') as _f:
+                    index_writer = csv.writer(_f)
+                    for uuid, dirname in current_index.items():
+                        index_writer.writerow([uuid, dirname])
+
+        if archived_uuids:
+            lookyloo.redis.hdel('lookup_dirs', *archived_uuids.keys())
+            lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+            lookyloo.clear_captures_index_cache(archived_uuids.keys())
+
+
+def main():
+    a = Archiver()
+    a.run(sleep_in_sec=3600 * 24)
+
+
+if __name__ == '__main__':
+    main()
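
Each `archived_captures/<year>/<month>/index` file written above is a two-column CSV mapping a capture UUID to its directory name. A minimal sketch of resolving an archived UUID back to a directory by scanning those files; the helper name is hypothetical and not part of this patch:

    import csv
    from pathlib import Path
    from typing import Optional

    def find_archived_capture(archived_captures_dir: Path, capture_uuid: str) -> Optional[Path]:
        # Walk every <year>/<month>/index CSV the archiver maintains.
        for index_file in archived_captures_dir.glob('*/*/index'):
            with index_file.open() as _f:
                for uuid, dirname in csv.reader(_f):
                    if uuid == capture_uuid:
                        return index_file.parent / dirname
        return None
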
diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index e6e3af8..68eda51 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
         self._check_indexes()
 
     def _build_missing_pickles(self):
-        for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'
@@ -45,15 +45,17 @@ class BackgroundIndexer(AbstractManager):
             with uuid_path.open() as f:
                 uuid = f.read()
 
+            if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
+                # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
+                self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
+
             try:
                 self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
                 self.lookyloo.get_crawled_tree(uuid)
                 self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                 self.logger.info(f'Pickle for {uuid} build.')
             except MissingUUID:
-                # The cache is not up-to-date, but the UUID definitely exists in the captures.
-                self.logger.warning(f'Unable to find {uuid}, re-triggering the cache.')
-                self.lookyloo._set_capture_cache(uuid_path.parent, force=True)
+                self.logger.warning(f'Unable to find {uuid}. That should not happen.')
             except NoValidHarFile:
                 self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
                 # The capture is not working, moving it away.
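
The switch to `sorted(..., reverse=True)` above prioritises recent captures: capture directories are named with ISO 8601 timestamps, so reverse lexicographic order is newest-first. A toy illustration with made-up directory names:

    from pathlib import Path

    dirs = [Path('2021-08-19T10:00:00.000000/uuid'),
            Path('2021-08-20T09:30:00.000000/uuid'),
            Path('2020-12-01T00:00:00.000000/uuid')]
    for uuid_path in sorted(dirs, reverse=True):
        # Prints 2021-08-20..., then 2021-08-19..., then 2020-12-01...
        print(uuid_path.parent.name)
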
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 8403da1..f184bb4 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -99,9 +99,6 @@ class Lookyloo():
         self.context = Context(self.sanejs)
         self._captures_index: Dict[str, CaptureCache] = {}
 
-        if not self.redis.exists('cache_loaded'):
-            self._init_existing_dumps()
-
     @property
     def redis(self):
         return Redis(connection_pool=self.redis_pool)
@@ -186,8 +183,6 @@ class Lookyloo():
         self._ensure_meta(capture_dir, ct)
         self._resolve_dns(ct)
         self.context.contextualize_tree(ct)
-        # Force update cache of the capture (takes care of the incomplete redirect key)
-        self._set_capture_cache(capture_dir, force=True)
         cache = self.capture_cache(capture_uuid)
         if not cache:
             raise LookylooException(f'Broken cache for {capture_dir}')
@@ -310,13 +305,14 @@ class Lookyloo():
         remove_pickle_tree(capture_dir)
 
     def rebuild_cache(self) -> None:
-        '''Flush and rebuild the redis cache. Doesn't remove the pickles.'''
+        '''Flush and rebuild the redis cache. Doesn't remove the pickles.
+        The cached captures will be rebuilt when loading the index.'''
         self.redis.flushdb()
-        self._init_existing_dumps()
 
     def rebuild_all(self) -> None:
-        '''Flush and rebuild the redis cache, and delede all the pickles.'''
-        [remove_pickle_tree(capture_dir) for capture_dir in self.capture_dirs]  # type: ignore
+        '''Flush and rebuild the redis cache, and delete all the pickles.
+        The captures will be rebuilt by the background indexer.'''
+        [remove_pickle_tree(capture_dir) for capture_dir in self.capture_dir.iterdir() if capture_dir.is_dir()]  # type: ignore
         self.rebuild_cache()
 
     def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
@@ -468,15 +464,9 @@ class Lookyloo():
             to_return[event_id].update(values)
         return to_return
 
-    def _set_capture_cache(self, capture_dir: Path, force: bool=False, redis_pipeline: Optional[Redis]=None) -> None:
-        '''Populate the redis cache for a capture. Mostly used on the index page.'''
-        # NOTE: this method is called in the background indexer as a fallback.
-        if force or not self.redis.exists(str(capture_dir)):
-            # (re)build cache
-            pass
-        else:
-            return
-
+    def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
+        '''Populate the redis cache for a capture. Mostly used on the index page.
+        NOTE: Doesn't require the pickle.'''
         with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
 
@@ -513,10 +503,7 @@ class Lookyloo():
         else:
             categories = []
 
-        if not redis_pipeline:
-            p = self.redis.pipeline()
-        else:
-            p = redis_pipeline
+        p = self.redis.pipeline()
         p.hset('lookup_dirs', uuid, str(capture_dir))
         if error_cache:
             if 'HTTP Error' not in error_cache['error']:
@@ -551,10 +538,10 @@ class Lookyloo():
                 cache['parent'] = f.read().strip()
 
         p.hmset(str(capture_dir), cache)
-        if not redis_pipeline:
-            p.execute()
+        p.execute()
         # If the cache is re-created for some reason, pop from the local cache.
         self._captures_index.pop(uuid, None)
+        return cache
 
     def hide_capture(self, capture_uuid: str, /) -> None:
         """Add the capture in the hidden pool (not shown on the front page)
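
After this refactor `_set_capture_cache()` always (re)builds the entry, writes it through a pipeline it owns, and returns the cache dict to the caller. A standalone sketch of that write pattern, assuming redis-py and a running Redis; the key names mirror the ones above but the path and payload are illustrative:

    from redis import Redis

    redis_client = Redis(decode_responses=True)
    capture_dir = '/illustrative/path/to/capture'  # hypothetical
    cache = {'uuid': '00000000-0000-0000-0000-000000000000', 'title': 'example'}
    p = redis_client.pipeline()
    p.hset('lookup_dirs', cache['uuid'], capture_dir)
    p.hmset(capture_dir, cache)  # same (deprecated but working) call the patch uses
    p.execute()  # both writes reach Redis in a single round-trip
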
@@ -580,7 +567,9 @@ class Lookyloo():
             # No captures at all on the instance
             return []
 
-        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if uuid in self._captures_index and not self._captures_index[uuid].incomplete_redirects]
+        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids
+                                         if (uuid in self._captures_index
+                                             and not self._captures_index[uuid].incomplete_redirects)]
 
         captures_to_get = set(capture_uuids) - set(self._captures_index.keys())
         if captures_to_get:
@@ -593,60 +582,47 @@ class Lookyloo():
                 if not c:
                     continue
                 c = CaptureCache(c)
-                if c.incomplete_redirects:
-                    self._set_capture_cache(c.capture_dir, force=True)
-                    c = self.capture_cache(c.uuid)
                 if hasattr(c, 'timestamp'):
                     all_cache.append(c)
                     self._captures_index[c.uuid] = c
         all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
         return all_cache
 
+    def clear_captures_index_cache(self, uuids: Iterable[str]) -> None:
+        [self._captures_index.pop(uuid) for uuid in uuids if uuid in self._captures_index]
+
     def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
-        """Get the cache from redis.
-        NOTE: Doesn't try to build the pickle"""
+        """Get the cache from redis."""
         if capture_uuid in self._captures_index:
+            if self._captures_index[capture_uuid].incomplete_redirects:
+                # Try to rebuild the cache
+                capture_dir = self._get_capture_dir(capture_uuid)
+                cached = self._set_capture_cache(capture_dir)
+                self._captures_index[capture_uuid] = CaptureCache(cached)
             return self._captures_index[capture_uuid]
 
         capture_dir = self._get_capture_dir(capture_uuid)
-        cached: Dict[str, Any] = self.redis.hgetall(str(capture_dir))
-        if not cached:
-            self.logger.warning(f'No cache available for {capture_dir}.')
+        if not capture_dir:
+            self.logger.warning(f'No directory for {capture_uuid}.')
             return None
+
+        cached = self.redis.hgetall(str(capture_dir))
+        if not cached:
+            cached = self._set_capture_cache(capture_dir)
         try:
-            cc = CaptureCache(cached)
-            self._captures_index[cc.uuid] = cc
-            return cc
+            self._captures_index[capture_uuid] = CaptureCache(cached)
+            return self._captures_index[capture_uuid]
         except LookylooException as e:
            self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
            return None
 
-    def _init_existing_dumps(self) -> None:
-        '''Initialize the cache for all the captures'''
-        p = self.redis.pipeline()
-        for capture_dir in self.capture_dirs:
-            if capture_dir.exists():
-                self._set_capture_cache(capture_dir, redis_pipeline=p)
-        p.set('cache_loaded', 1)
-        p.execute()
-
-    @property
-    def capture_dirs(self) -> List[Path]:
-        '''Get all the capture directories, sorder from newest to oldest.'''
-        for capture_dir in self.capture_dir.iterdir():
-            if capture_dir.is_dir() and not capture_dir.iterdir():
-                # Cleanup self.capture_dir of failed runs.
-                capture_dir.rmdir()
-            if not (capture_dir / 'uuid').exists():
-                # Create uuid if missing
-                with (capture_dir / 'uuid').open('w') as f:
-                    f.write(str(uuid4()))
-        return sorted(self.capture_dir.iterdir(), reverse=True)
-
     def _get_capture_dir(self, capture_uuid: str, /) -> Path:
         '''Use the cache to get a capture directory from a capture UUID'''
         capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)
         if not capture_dir:
-            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+            # Try in the archive
+            capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
+            if not capture_dir:
+                raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
         to_return = Path(capture_dir)
         if not to_return.exists():
             # The capture was removed, remove the UUID
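
The `_get_capture_dir()` change above is what keeps archived captures reachable: when a UUID is no longer in `lookup_dirs`, the lookup falls back to the `lookup_dirs_archived` hash populated by `bin/archiver.py`. The same two-level lookup, sketched standalone (assumes redis-py; the local `MissingUUID` stands in for Lookyloo's exception):

    from redis import Redis

    class MissingUUID(Exception):
        pass

    def get_capture_dir(redis_client: Redis, capture_uuid: str) -> str:
        capture_dir = redis_client.hget('lookup_dirs', capture_uuid)
        if not capture_dir:
            # Fall back to the captures moved away by the archiver.
            capture_dir = redis_client.hget('lookup_dirs_archived', capture_uuid)
        if not capture_dir:
            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
        return capture_dir
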
@@ -970,8 +946,7 @@ class Lookyloo():
                 cookies = item['cookies']
                 with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                     json.dump(cookies, _cookies)
-
-        self._set_capture_cache(dirpath)
+        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
         return perma_uuid
 
     def get_body_hash_investigator(self, body_hash: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:
diff --git a/pyproject.toml b/pyproject.toml
index aaf1fde..e051566 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ stop = "bin.stop:main"
 rebuild_caches = "bin.rebuild_caches:main"
 update = "bin.update:main"
 background_indexer = "bin.background_indexer:main"
+archiver = "bin.archiver:main"
 
 [tool.poetry.dependencies]
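
The new `[tool.poetry.scripts]` entry exposes the archiver as a console script, so `poetry run archiver` starts the daily loop. It is equivalent to this short Python sketch, mirroring `main()` in `bin/archiver.py`:

    from bin.archiver import Archiver

    a = Archiver()
    a.run(sleep_in_sec=3600 * 24)  # wake up and archive once a day
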