fix: use a more direct way to index

2024-03-09 15:33:10 +01:00 · 2024-03-09 15:33:10 +01:00 · d2df33aa5c
parent 7bd7488bd4
commit d2df33aa5c
2 changed files with 43 additions and 27 deletions
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -5,8 +5,12 @@ from __future__ import annotations
 import logging
 import logging.config

+from redis import Redis
+from typing import Generator
+
 from lookyloo import Lookyloo, Indexing
-from lookyloo.default import AbstractManager, get_config
+from lookyloo.capturecache import get_pickle_path
+from lookyloo.default import AbstractManager, get_config, get_socket_path
 from lookyloo.exceptions import NoValidHarFile


@@ -18,6 +22,7 @@ class BackgroundIndexer(AbstractManager):
    def __init__(self, full: bool=False, loglevel: int | None=None):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
+        self.is_public_instance = get_config('generic', 'public_instance')
        self.full_indexer = full
        self.indexing = Indexing(full_index=self.full_indexer)
        if self.full_indexer:
@@ -25,52 +30,61 @@ class BackgroundIndexer(AbstractManager):
        else:
            self.script_name = 'background_indexer'

+        # Redis connector so we don't use the one from Lookyloo
+        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
+
    def _to_run_forever(self) -> None:
        self._check_indexes()
        # Don't need the cache in this class.
        self.lookyloo.clear_tree_cache()

+    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]:
+        # NOTE: only get the non-archived captures for now.
+        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
+            if not self.full_indexer:
+                # If we're not running the full indexer, check if the capture should be indexed.
+                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
+                    # Capture unindexed
+                    continue
+
+            if get_pickle_path(directory) is None:
+                # pickle isn't ready, we can't index.
+                continue
+            indexed = self.indexing.capture_indexed(uuid)
+            if all(indexed):
+                continue
+            yield indexed, uuid
+
    def _check_indexes(self) -> None:
        if not self.indexing.can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Indexing already ongoing in another process.')
            return None
        self.logger.info(f'Check {self.script_name}...')
-        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
-            if not self.full_indexer:
-                # If we're not running the full indexer, check if the capture should be indexed.
-                if self.lookyloo.is_public_instance and cache.no_index:
-                    # Capture unindexed
-                    continue
-            if not cache.tree_ready:
-                # pickle isn't ready, we can't index.
-                continue
-            indexed = self.indexing.capture_indexed(cache.uuid)
-            if all(indexed):
-                continue
+        for indexed, uuid_to_index in self._to_index_no_cache():
            try:
-                ct = self.lookyloo.get_crawled_tree(cache.uuid)
+                ct = self.lookyloo.get_crawled_tree(uuid_to_index)
            except NoValidHarFile:
-                self.logger.warning(f'Broken pickle for {cache.uuid}')
-                self.lookyloo.remove_pickle(cache.uuid)
+                self.logger.warning(f'Broken pickle for {uuid_to_index}')
+                self.lookyloo.remove_pickle(uuid_to_index)
                continue

            if not indexed[0]:
-                self.logger.info(f'Indexing urls for {cache.uuid}')
+                self.logger.info(f'Indexing urls for {uuid_to_index}')
                self.indexing.index_url_capture(ct)
            if not indexed[1]:
-                self.logger.info(f'Indexing resources for {cache.uuid}')
+                self.logger.info(f'Indexing resources for {uuid_to_index}')
                self.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
-                self.logger.info(f'Indexing cookies for {cache.uuid}')
+                self.logger.info(f'Indexing cookies for {uuid_to_index}')
                self.indexing.index_cookies_capture(ct)
            if not indexed[3]:
-                self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
+                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
                self.indexing.index_http_headers_hashes_capture(ct)
            if not indexed[4]:
-                self.logger.info(f'Indexing favicons for {cache.uuid}')
-                favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
-                self.indexing.index_favicons_capture(cache.uuid, favicons)
+                self.logger.info(f'Indexing favicons for {uuid_to_index}')
+                favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
+                self.indexing.index_favicons_capture(uuid_to_index, favicons)
            # NOTE: categories aren't taken in account here, should be fixed(?)
            # see indexing.index_categories_capture(capture_uuid, categories)
        self.indexing.indexing_done()
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -95,7 +95,7 @@ class CaptureCache():

    @property
    def tree_ready(self) -> bool:
-        return bool(_pickle_path(self.capture_dir))
+        return bool(get_pickle_path(self.capture_dir))

    @property
    def tree(self) -> CrawledTree:
@@ -106,7 +106,9 @@ class CaptureCache():
        return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)


-def _pickle_path(capture_dir: Path) -> Path | None:
+def get_pickle_path(capture_dir: Path | str) -> Path | None:
+    if isinstance(capture_dir, str):
+        capture_dir = Path(capture_dir)
    pickle_file_gz = capture_dir / 'tree.pickle.gz'
    if pickle_file_gz.exists():
        return pickle_file_gz
@@ -119,14 +121,14 @@ def _pickle_path(capture_dir: Path) -> Path | None:


 def remove_pickle_tree(capture_dir: Path) -> None:
-    pickle_path = _pickle_path(capture_dir)
+    pickle_path = get_pickle_path(capture_dir)
    if pickle_path and pickle_path.exists():
        pickle_path.unlink()


@lru_cache(maxsize=64)
 def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
-    pickle_path = _pickle_path(capture_dir)
+    pickle_path = get_pickle_path(capture_dir)
    tree = None
    try:
        if pickle_path: