fix: use a more direct way to index

pull/893/head
Raphaël Vinot 2024-03-09 15:33:10 +01:00
parent 7bd7488bd4
commit d2df33aa5c
2 changed files with 43 additions and 27 deletions

background_indexer.py

@@ -5,8 +5,12 @@ from __future__ import annotations
 import logging
 import logging.config
 
+from redis import Redis
+from typing import Generator
+
 from lookyloo import Lookyloo, Indexing
-from lookyloo.default import AbstractManager, get_config
+from lookyloo.capturecache import get_pickle_path
+from lookyloo.default import AbstractManager, get_config, get_socket_path
 from lookyloo.exceptions import NoValidHarFile
@@ -18,6 +22,7 @@ class BackgroundIndexer(AbstractManager):
     def __init__(self, full: bool=False, loglevel: int | None=None):
         super().__init__(loglevel)
         self.lookyloo = Lookyloo()
+        self.is_public_instance = get_config('generic', 'public_instance')
         self.full_indexer = full
         self.indexing = Indexing(full_index=self.full_indexer)
         if self.full_indexer:
@@ -25,52 +30,61 @@ class BackgroundIndexer(AbstractManager):
         else:
             self.script_name = 'background_indexer'
+        # Redis connector so we don't use the one from Lookyloo
+        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
 
     def _to_run_forever(self) -> None:
         self._check_indexes()
         # Don't need the cache in this class.
         self.lookyloo.clear_tree_cache()
 
+    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]:
+        # NOTE: only get the non-archived captures for now.
+        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
+            if not self.full_indexer:
+                # If we're not running the full indexer, check if the capture should be indexed.
+                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
+                    # Capture unindexed
+                    continue
+            if get_pickle_path(directory) is None:
+                # pickle isn't ready, we can't index.
+                continue
+            indexed = self.indexing.capture_indexed(uuid)
+            if all(indexed):
+                continue
+            yield indexed, uuid
+
     def _check_indexes(self) -> None:
         if not self.indexing.can_index:
             # There is no reason to run this method in multiple scripts.
             self.logger.info('Indexing already ongoing in another process.')
             return None
 
         self.logger.info(f'Check {self.script_name}...')
-        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
-            if not self.full_indexer:
-                # If we're not running the full indexer, check if the capture should be indexed.
-                if self.lookyloo.is_public_instance and cache.no_index:
-                    # Capture unindexed
-                    continue
-            if not cache.tree_ready:
-                # pickle isn't ready, we can't index.
-                continue
-            indexed = self.indexing.capture_indexed(cache.uuid)
-            if all(indexed):
-                continue
+        for indexed, uuid_to_index in self._to_index_no_cache():
             try:
-                ct = self.lookyloo.get_crawled_tree(cache.uuid)
+                ct = self.lookyloo.get_crawled_tree(uuid_to_index)
             except NoValidHarFile:
-                self.logger.warning(f'Broken pickle for {cache.uuid}')
-                self.lookyloo.remove_pickle(cache.uuid)
+                self.logger.warning(f'Broken pickle for {uuid_to_index}')
+                self.lookyloo.remove_pickle(uuid_to_index)
                 continue
 
             if not indexed[0]:
-                self.logger.info(f'Indexing urls for {cache.uuid}')
+                self.logger.info(f'Indexing urls for {uuid_to_index}')
                 self.indexing.index_url_capture(ct)
             if not indexed[1]:
-                self.logger.info(f'Indexing resources for {cache.uuid}')
+                self.logger.info(f'Indexing resources for {uuid_to_index}')
                 self.indexing.index_body_hashes_capture(ct)
             if not indexed[2]:
-                self.logger.info(f'Indexing cookies for {cache.uuid}')
+                self.logger.info(f'Indexing cookies for {uuid_to_index}')
                 self.indexing.index_cookies_capture(ct)
             if not indexed[3]:
-                self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
+                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
                 self.indexing.index_http_headers_hashes_capture(ct)
             if not indexed[4]:
-                self.logger.info(f'Indexing favicons for {cache.uuid}')
-                favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
-                self.indexing.index_favicons_capture(cache.uuid, favicons)
+                self.logger.info(f'Indexing favicons for {uuid_to_index}')
+                favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
+                self.indexing.index_favicons_capture(uuid_to_index, favicons)
             # NOTE: categories aren't taken in account here, should be fixed(?)
             # see indexing.index_categories_capture(capture_uuid, categories)
         self.indexing.indexing_done()
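
Instead of going through self.lookyloo.sorted_capture_cache(), the indexer now reads the 'lookup_dirs' hash directly from its own Redis connection, which is the "more direct way to index" from the commit message. Below is a minimal sketch, outside the commit, of the Redis layout the new _to_index_no_cache generator relies on: 'lookup_dirs' maps capture UUIDs to capture directories, and a directory key may carry a 'no_index' flag. The socket path is a placeholder.

from redis import Redis

redis = Redis(unix_socket_path='/path/to/cache.sock', decode_responses=True)  # placeholder path

for uuid, directory in redis.hscan_iter('lookup_dirs'):
    # Skip captures flagged as unindexed (relevant on a public instance).
    if redis.hexists(directory, 'no_index'):
        continue
    print(uuid, directory)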

lookyloo/capturecache.py

@@ -95,7 +95,7 @@ class CaptureCache():
     @property
     def tree_ready(self) -> bool:
-        return bool(_pickle_path(self.capture_dir))
+        return bool(get_pickle_path(self.capture_dir))
 
     @property
     def tree(self) -> CrawledTree:
@@ -106,7 +106,9 @@
         return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
 
 
-def _pickle_path(capture_dir: Path) -> Path | None:
+def get_pickle_path(capture_dir: Path | str) -> Path | None:
+    if isinstance(capture_dir, str):
+        capture_dir = Path(capture_dir)
     pickle_file_gz = capture_dir / 'tree.pickle.gz'
     if pickle_file_gz.exists():
         return pickle_file_gz
@@ -119,14 +121,14 @@ def _pickle_path(capture_dir: Path) -> Path | None:
 def remove_pickle_tree(capture_dir: Path) -> None:
-    pickle_path = _pickle_path(capture_dir)
+    pickle_path = get_pickle_path(capture_dir)
     if pickle_path and pickle_path.exists():
         pickle_path.unlink()
 
 
 @lru_cache(maxsize=64)
 def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
-    pickle_path = _pickle_path(capture_dir)
+    pickle_path = get_pickle_path(capture_dir)
     tree = None
     try:
         if pickle_path:
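
Renaming _pickle_path to get_pickle_path and widening its parameter to Path | str is what lets the background indexer pass the raw directory string it reads from 'lookup_dirs' without converting it first. A small usage sketch under that assumption; the path below is illustrative.

from pathlib import Path
from lookyloo.capturecache import get_pickle_path

capture_dir = '/illustrative/path/to/a/capture'  # value as stored in 'lookup_dirs'
# Both calls behave the same after this change: a str is coerced to Path internally.
ready_from_str = get_pickle_path(capture_dir) is not None
ready_from_path = get_pickle_path(Path(capture_dir)) is not None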