mirror of https://github.com/CIRCL/lookyloo
fix: use a more direct way to index
parent
7bd7488bd4
commit
d2df33aa5c
|
@ -5,8 +5,12 @@ from __future__ import annotations
|
|||
import logging
|
||||
import logging.config
|
||||
|
||||
from redis import Redis
|
||||
from typing import Generator
|
||||
|
||||
from lookyloo import Lookyloo, Indexing
|
||||
from lookyloo.default import AbstractManager, get_config
|
||||
from lookyloo.capturecache import get_pickle_path
|
||||
from lookyloo.default import AbstractManager, get_config, get_socket_path
|
||||
from lookyloo.exceptions import NoValidHarFile
|
||||
|
||||
|
||||
|
@ -18,6 +22,7 @@ class BackgroundIndexer(AbstractManager):
|
|||
def __init__(self, full: bool=False, loglevel: int | None=None):
|
||||
super().__init__(loglevel)
|
||||
self.lookyloo = Lookyloo()
|
||||
self.is_public_instance = get_config('generic', 'public_instance')
|
||||
self.full_indexer = full
|
||||
self.indexing = Indexing(full_index=self.full_indexer)
|
||||
if self.full_indexer:
|
||||
|
@ -25,52 +30,61 @@ class BackgroundIndexer(AbstractManager):
|
|||
else:
|
||||
self.script_name = 'background_indexer'
|
||||
|
||||
# Redis connector so we don't use the one from Lookyloo
|
||||
self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
|
||||
|
||||
def _to_run_forever(self) -> None:
    """Run one index check, then clear the Lookyloo tree cache."""
    self._check_indexes()
    # This class has no use for the cached trees, so drop them after the pass.
    self.lookyloo.clear_tree_cache()
|
||||
|
||||
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]:
    """Yield ``(indexed, uuid)`` for every capture that still has something to index.

    The captures are read straight from the ``lookup_dirs`` redis hash
    (field: capture UUID, value: capture directory). ``indexed`` is the
    value returned by ``self.indexing.capture_indexed(uuid)``; a capture is
    only yielded when at least one of its entries is falsy.
    """
    # NOTE: only get the non-archived captures for now.
    for capture_uuid, capture_dir in self.redis.hscan_iter('lookup_dirs'):
        # Outside of the full indexer, a public instance leaves captures
        # flagged 'no_index' unindexed.
        if (not self.full_indexer
                and self.is_public_instance
                and self.redis.hexists(capture_dir, 'no_index')):
            continue

        # Without a pickle there is no tree to index yet.
        if get_pickle_path(capture_dir) is None:
            continue

        status = self.indexing.capture_indexed(capture_uuid)
        if not all(status):
            yield status, capture_uuid
|
||||
|
||||
def _check_indexes(self) -> None:
    """Run every missing index for the captures yielded by ``_to_index_no_cache``.

    Returns immediately when ``self.indexing.can_index`` is falsy (another
    process is already indexing). Otherwise, for each ``(indexed, uuid)``
    pair, the crawled tree is loaded and each index whose flag is falsy is
    (re)built. Flag positions, in order: urls, resources (body hashes),
    cookies, HTTP header hashes, favicons.

    A capture whose tree raises ``NoValidHarFile`` gets its pickle removed
    and is skipped.
    """
    if not self.indexing.can_index:
        # There is no reason to run this method in multiple scripts.
        self.logger.info('Indexing already ongoing in another process.')
        return None
    self.logger.info(f'Check {self.script_name}...')
    # Iterate only over the captures that actually need indexing; the
    # generator already filtered out unindexable and fully indexed ones.
    for indexed, uuid_to_index in self._to_index_no_cache():
        try:
            ct = self.lookyloo.get_crawled_tree(uuid_to_index)
        except NoValidHarFile:
            self.logger.warning(f'Broken pickle for {uuid_to_index}')
            self.lookyloo.remove_pickle(uuid_to_index)
            continue

        if not indexed[0]:
            self.logger.info(f'Indexing urls for {uuid_to_index}')
            self.indexing.index_url_capture(ct)
        if not indexed[1]:
            self.logger.info(f'Indexing resources for {uuid_to_index}')
            self.indexing.index_body_hashes_capture(ct)
        if not indexed[2]:
            self.logger.info(f'Indexing cookies for {uuid_to_index}')
            self.indexing.index_cookies_capture(ct)
        if not indexed[3]:
            self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
            self.indexing.index_http_headers_hashes_capture(ct)
        if not indexed[4]:
            self.logger.info(f'Indexing favicons for {uuid_to_index}')
            favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
            self.indexing.index_favicons_capture(uuid_to_index, favicons)
        # NOTE: categories aren't taken into account here, should be fixed(?)
        # see indexing.index_categories_capture(capture_uuid, categories)
    # NOTE(review): presumably signals the end of the pass started by the
    # `can_index` check above -- confirm against Indexing.
    self.indexing.indexing_done()
|
||||
|
|
|
@ -95,7 +95,7 @@ class CaptureCache():
|
|||
|
||||
@property
def tree_ready(self) -> bool:
    """Whether ``get_pickle_path`` finds a pickle in this capture's directory."""
    # Single lookup through the public helper; the stale call to the
    # pre-rename `_pickle_path` made this line unreachable.
    return bool(get_pickle_path(self.capture_dir))
|
||||
|
||||
@property
|
||||
def tree(self) -> CrawledTree:
|
||||
|
@ -106,7 +106,9 @@ class CaptureCache():
|
|||
return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
|
||||
|
||||
|
||||
def _pickle_path(capture_dir: Path) -> Path | None:
|
||||
def get_pickle_path(capture_dir: Path | str) -> Path | None:
|
||||
if isinstance(capture_dir, str):
|
||||
capture_dir = Path(capture_dir)
|
||||
pickle_file_gz = capture_dir / 'tree.pickle.gz'
|
||||
if pickle_file_gz.exists():
|
||||
return pickle_file_gz
|
||||
|
@ -119,14 +121,14 @@ def _pickle_path(capture_dir: Path) -> Path | None:
|
|||
|
||||
|
||||
def remove_pickle_tree(capture_dir: Path) -> None:
    """Delete the pickle found by ``get_pickle_path`` in *capture_dir*, if any.

    Does nothing when no pickle path is returned or the file is already gone.
    """
    # Resolve the path once, through the public helper (the former private
    # `_pickle_path` lookup was a dead, immediately overwritten assignment).
    pickle_path = get_pickle_path(capture_dir)
    if pickle_path and pickle_path.exists():
        pickle_path.unlink()
|
||||
|
||||
|
||||
@lru_cache(maxsize=64)
|
||||
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
|
||||
pickle_path = _pickle_path(capture_dir)
|
||||
pickle_path = get_pickle_path(capture_dir)
|
||||
tree = None
|
||||
try:
|
||||
if pickle_path:
|
||||
|
|
Loading…
Reference in New Issue