From b3541e0e78f25f14902c88ddd8a0faeb034b6df9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Fri, 12 Mar 2021 16:53:00 +0100
Subject: [PATCH] new: background indexer

---
 bin/background_indexer.py | 85 +++++++++++++++++++++++++++++++++++++++
 pyproject.toml            |  1 +
 2 files changed, 86 insertions(+)
 create mode 100755 bin/background_indexer.py

diff --git a/bin/background_indexer.py b/bin/background_indexer.py
new file mode 100755
index 0000000..1cc9a59
--- /dev/null
+++ b/bin/background_indexer.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import logging
+
+from lookyloo.abstractmanager import AbstractManager
+from lookyloo.helpers import set_running, unset_running
+from lookyloo.lookyloo import Lookyloo
+from lookyloo.exceptions import NoValidHarFile
+
+logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
+                    level=logging.INFO, datefmt='%I:%M:%S')
+
+
+class BackgroundIndexer(AbstractManager):
+
+    def __init__(self, loglevel: int=logging.INFO):
+        super().__init__(loglevel)
+        self.lookyloo = Lookyloo()
+        # make sure discarded captures dir exists
+        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
+        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
+
+    def _to_run_forever(self):
+        set_running('background_indexer')
+        self._build_missing_pickles()
+        self._check_indexes()
+        unset_running('background_indexer')
+
+    def _build_missing_pickles(self):
+        for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
+            if (uuid_path.parent / 'tree.pickle').exists():
+                continue
+            with uuid_path.open() as f:
+                uuid = f.read()
+            try:
+                self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
+                self.lookyloo.get_crawled_tree(uuid)
+            except NoValidHarFile:
+                self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
+                # The capture is not working, moving it away.
+                self.lookyloo.redis.hdel('lookup_dirs', uuid)
+                uuid_path.parent.rename(self.discarded_captures_dir / uuid_path.parent.name)
+
+    def _check_indexes(self):
+        for uuid in self.lookyloo.capture_uuids:
+            cache = self.lookyloo.capture_cache(uuid)
+            if not cache:
+                # Shouldn't happen, but ignore in this process
+                continue
+            if self.lookyloo.is_public_instance and cache.no_index:
+                # Capture unindexed
+                continue
+            p = self.lookyloo.indexing.redis.pipeline()
+            p.sismember('indexed_urls', uuid)
+            p.sismember('indexed_body_hashes', uuid)
+            p.sismember('indexed_cookies', uuid)
+            indexed = p.execute()
+            if all(indexed):
+                continue
+            try:
+                ct = self.lookyloo.get_crawled_tree(uuid)
+            except NoValidHarFile:
+                self.logger.warning(f'Broken pickle for {uuid}')
+                self.lookyloo.remove_pickle(uuid)
+                continue
+
+            if not indexed[0]:
+                self.logger.info(f'Indexing urls for {uuid}')
+                self.lookyloo.indexing.index_url_capture(ct)
+            if not indexed[1]:
+                self.logger.info(f'Indexing resources for {uuid}')
+                self.lookyloo.indexing.index_body_hashes_capture(ct)
+            if not indexed[2]:
+                self.logger.info(f'Indexing cookies for {uuid}')
+                self.lookyloo.indexing.index_cookies_capture(ct)
+
+
+def main():
+    i = BackgroundIndexer()
+    i.run(sleep_in_sec=60)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index e944b04..55146dc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ shutdown = "bin.shutdown:main"
 stop = "bin.stop:main"
 rebuild_caches = "bin.rebuild_caches:main"
 update = "bin.update:main"
+background_indexer = "bin.background_indexer:main"
 
 [tool.poetry.dependencies]
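
Note (not part of the patch): the new pyproject.toml entry sits alongside the existing stop, update, and rebuild_caches script entries, so once the project is installed the daemon can be started with `poetry run background_indexer`, which calls main() and keeps looping via the inherited run(sleep_in_sec=60). As a minimal sketch, assuming a configured Lookyloo instance with Redis reachable, a single pass can also be triggered by hand; the names below come from the patch, only the manual invocation itself is illustrative:

# Minimal sketch, not part of the patch: run one indexing pass by hand
# instead of the 60-second AbstractManager loop. Assumes the Lookyloo
# environment (Redis, capture directory) is already set up.
from bin.background_indexer import BackgroundIndexer

indexer = BackgroundIndexer()
indexer._to_run_forever()  # build missing pickles, then backfill URL, body-hash and cookie indexes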