From d2df33aa5c245f5703b3be9bc7103a0b9d0551e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Sat, 9 Mar 2024 15:33:10 +0100 Subject: [PATCH] fix: use a more direct way to index --- bin/background_indexer.py | 60 ++++++++++++++++++++++++--------------- lookyloo/capturecache.py | 10 ++++--- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/bin/background_indexer.py b/bin/background_indexer.py index b82a15a5..fcbd4467 100755 --- a/bin/background_indexer.py +++ b/bin/background_indexer.py @@ -5,8 +5,12 @@ from __future__ import annotations import logging import logging.config +from redis import Redis +from typing import Generator + from lookyloo import Lookyloo, Indexing -from lookyloo.default import AbstractManager, get_config +from lookyloo.capturecache import get_pickle_path +from lookyloo.default import AbstractManager, get_config, get_socket_path from lookyloo.exceptions import NoValidHarFile @@ -18,6 +22,7 @@ class BackgroundIndexer(AbstractManager): def __init__(self, full: bool=False, loglevel: int | None=None): super().__init__(loglevel) self.lookyloo = Lookyloo() + self.is_public_instance = get_config('generic', 'public_instance') self.full_indexer = full self.indexing = Indexing(full_index=self.full_indexer) if self.full_indexer: @@ -25,52 +30,61 @@ class BackgroundIndexer(AbstractManager): else: self.script_name = 'background_indexer' + # Redis connector so we don't use the one from Lookyloo + self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True) + def _to_run_forever(self) -> None: self._check_indexes() # Don't need the cache in this class. self.lookyloo.clear_tree_cache() + def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]: + # NOTE: only get the non-archived captures for now. + for uuid, directory in self.redis.hscan_iter('lookup_dirs'): + if not self.full_indexer: + # If we're not running the full indexer, check if the capture should be indexed. + if self.is_public_instance and self.redis.hexists(directory, 'no_index'): + # Capture unindexed + continue + + if get_pickle_path(directory) is None: + # pickle isn't ready, we can't index. + continue + indexed = self.indexing.capture_indexed(uuid) + if all(indexed): + continue + yield indexed, uuid + def _check_indexes(self) -> None: if not self.indexing.can_index: # There is no reason to run this method in multiple scripts. self.logger.info('Indexing already ongoing in another process.') return None self.logger.info(f'Check {self.script_name}...') - for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False): - if not self.full_indexer: - # If we're not running the full indexer, check if the capture should be indexed. - if self.lookyloo.is_public_instance and cache.no_index: - # Capture unindexed - continue - if not cache.tree_ready: - # pickle isn't ready, we can't index. - continue - indexed = self.indexing.capture_indexed(cache.uuid) - if all(indexed): - continue + for indexed, uuid_to_index in self._to_index_no_cache(): try: - ct = self.lookyloo.get_crawled_tree(cache.uuid) + ct = self.lookyloo.get_crawled_tree(uuid_to_index) except NoValidHarFile: - self.logger.warning(f'Broken pickle for {cache.uuid}') - self.lookyloo.remove_pickle(cache.uuid) + self.logger.warning(f'Broken pickle for {uuid_to_index}') + self.lookyloo.remove_pickle(uuid_to_index) continue if not indexed[0]: - self.logger.info(f'Indexing urls for {cache.uuid}') + self.logger.info(f'Indexing urls for {uuid_to_index}') self.indexing.index_url_capture(ct) if not indexed[1]: - self.logger.info(f'Indexing resources for {cache.uuid}') + self.logger.info(f'Indexing resources for {uuid_to_index}') self.indexing.index_body_hashes_capture(ct) if not indexed[2]: - self.logger.info(f'Indexing cookies for {cache.uuid}') + self.logger.info(f'Indexing cookies for {uuid_to_index}') self.indexing.index_cookies_capture(ct) if not indexed[3]: - self.logger.info(f'Indexing HH Hashes for {cache.uuid}') + self.logger.info(f'Indexing HH Hashes for {uuid_to_index}') self.indexing.index_http_headers_hashes_capture(ct) if not indexed[4]: - self.logger.info(f'Indexing favicons for {cache.uuid}') - favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False) - self.indexing.index_favicons_capture(cache.uuid, favicons) + self.logger.info(f'Indexing favicons for {uuid_to_index}') + favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False) + self.indexing.index_favicons_capture(uuid_to_index, favicons) # NOTE: categories aren't taken in account here, should be fixed(?) # see indexing.index_categories_capture(capture_uuid, categories) self.indexing.indexing_done() diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py index d9b839e8..e06330b6 100644 --- a/lookyloo/capturecache.py +++ b/lookyloo/capturecache.py @@ -95,7 +95,7 @@ class CaptureCache(): @property def tree_ready(self) -> bool: - return bool(_pickle_path(self.capture_dir)) + return bool(get_pickle_path(self.capture_dir)) @property def tree(self) -> CrawledTree: @@ -106,7 +106,9 @@ class CaptureCache(): return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger) -def _pickle_path(capture_dir: Path) -> Path | None: +def get_pickle_path(capture_dir: Path | str) -> Path | None: + if isinstance(capture_dir, str): + capture_dir = Path(capture_dir) pickle_file_gz = capture_dir / 'tree.pickle.gz' if pickle_file_gz.exists(): return pickle_file_gz @@ -119,14 +121,14 @@ def _pickle_path(capture_dir: Path) -> Path | None: def remove_pickle_tree(capture_dir: Path) -> None: - pickle_path = _pickle_path(capture_dir) + pickle_path = get_pickle_path(capture_dir) if pickle_path and pickle_path.exists(): pickle_path.unlink() @lru_cache(maxsize=64) def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree: - pickle_path = _pickle_path(capture_dir) + pickle_path = get_pickle_path(capture_dir) tree = None try: if pickle_path: