From d2df33aa5c245f5703b3be9bc7103a0b9d0551e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= <raphael@vinot.info>
Date: Sat, 9 Mar 2024 15:33:10 +0100
Subject: [PATCH] fix: pick captures to index from Redis lookup_dirs, not the capture cache

---
 bin/background_indexer.py | 60 ++++++++++++++++++++++++---------------
 lookyloo/capturecache.py  | 10 ++++---
 2 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index b82a15a5..fcbd4467 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -5,8 +5,12 @@ from __future__ import annotations
 import logging
 import logging.config
 
+from redis import Redis
+from typing import Generator
+
 from lookyloo import Lookyloo, Indexing
-from lookyloo.default import AbstractManager, get_config
+from lookyloo.capturecache import get_pickle_path
+from lookyloo.default import AbstractManager, get_config, get_socket_path
 from lookyloo.exceptions import NoValidHarFile
 
 
@@ -18,6 +22,7 @@ class BackgroundIndexer(AbstractManager):
     def __init__(self, full: bool=False, loglevel: int | None=None):
         super().__init__(loglevel)
         self.lookyloo = Lookyloo()
+        self.is_public_instance = get_config('generic', 'public_instance')
         self.full_indexer = full
         self.indexing = Indexing(full_index=self.full_indexer)
         if self.full_indexer:
@@ -25,52 +30,61 @@ class BackgroundIndexer(AbstractManager):
         else:
             self.script_name = 'background_indexer'
 
+        # Redis connector so we don't use the one from Lookyloo
+        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
+
     def _to_run_forever(self) -> None:
         self._check_indexes()
         # Don't need the cache in this class.
         self.lookyloo.clear_tree_cache()
 
+    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]:
+        # NOTE: only get the non-archived captures for now.
+        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
+            if not self.full_indexer:
+                # If we're not running the full indexer, check if the capture should be indexed.
+                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
+                    # Capture unindexed
+                    continue
+
+            if get_pickle_path(directory) is None:
+                # pickle isn't ready, we can't index.
+                continue
+            indexed = self.indexing.capture_indexed(uuid)
+            if all(indexed):
+                continue
+            yield indexed, uuid
+
     def _check_indexes(self) -> None:
         if not self.indexing.can_index:
             # There is no reason to run this method in multiple scripts.
             self.logger.info('Indexing already ongoing in another process.')
             return None
         self.logger.info(f'Check {self.script_name}...')
-        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
-            if not self.full_indexer:
-                # If we're not running the full indexer, check if the capture should be indexed.
-                if self.lookyloo.is_public_instance and cache.no_index:
-                    # Capture unindexed
-                    continue
-            if not cache.tree_ready:
-                # pickle isn't ready, we can't index.
-                continue
-            indexed = self.indexing.capture_indexed(cache.uuid)
-            if all(indexed):
-                continue
+        for indexed, uuid_to_index in self._to_index_no_cache():
             try:
-                ct = self.lookyloo.get_crawled_tree(cache.uuid)
+                ct = self.lookyloo.get_crawled_tree(uuid_to_index)
             except NoValidHarFile:
-                self.logger.warning(f'Broken pickle for {cache.uuid}')
-                self.lookyloo.remove_pickle(cache.uuid)
+                self.logger.warning(f'Broken pickle for {uuid_to_index}')
+                self.lookyloo.remove_pickle(uuid_to_index)
                 continue
 
             if not indexed[0]:
-                self.logger.info(f'Indexing urls for {cache.uuid}')
+                self.logger.info(f'Indexing urls for {uuid_to_index}')
                 self.indexing.index_url_capture(ct)
             if not indexed[1]:
-                self.logger.info(f'Indexing resources for {cache.uuid}')
+                self.logger.info(f'Indexing resources for {uuid_to_index}')
                 self.indexing.index_body_hashes_capture(ct)
             if not indexed[2]:
-                self.logger.info(f'Indexing cookies for {cache.uuid}')
+                self.logger.info(f'Indexing cookies for {uuid_to_index}')
                 self.indexing.index_cookies_capture(ct)
             if not indexed[3]:
-                self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
+                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
                 self.indexing.index_http_headers_hashes_capture(ct)
             if not indexed[4]:
-                self.logger.info(f'Indexing favicons for {cache.uuid}')
-                favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
-                self.indexing.index_favicons_capture(cache.uuid, favicons)
+                self.logger.info(f'Indexing favicons for {uuid_to_index}')
+                favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
+                self.indexing.index_favicons_capture(uuid_to_index, favicons)
             # NOTE: categories aren't taken in account here, should be fixed(?)
             # see indexing.index_categories_capture(capture_uuid, categories)
         self.indexing.indexing_done()
diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py
index d9b839e8..e06330b6 100644
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -95,7 +95,7 @@ class CaptureCache():
 
     @property
     def tree_ready(self) -> bool:
-        return bool(_pickle_path(self.capture_dir))
+        return bool(get_pickle_path(self.capture_dir))
 
     @property
     def tree(self) -> CrawledTree:
@@ -106,7 +106,9 @@ class CaptureCache():
         return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
 
 
-def _pickle_path(capture_dir: Path) -> Path | None:
+def get_pickle_path(capture_dir: Path | str) -> Path | None:
+    if isinstance(capture_dir, str):
+        capture_dir = Path(capture_dir)
     pickle_file_gz = capture_dir / 'tree.pickle.gz'
     if pickle_file_gz.exists():
         return pickle_file_gz
@@ -119,14 +121,14 @@ def _pickle_path(capture_dir: Path) -> Path | None:
 
 
 def remove_pickle_tree(capture_dir: Path) -> None:
-    pickle_path = _pickle_path(capture_dir)
+    pickle_path = get_pickle_path(capture_dir)
     if pickle_path and pickle_path.exists():
         pickle_path.unlink()
 
 
 @lru_cache(maxsize=64)
 def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
-    pickle_path = _pickle_path(capture_dir)
+    pickle_path = get_pickle_path(capture_dir)
     tree = None
     try:
         if pickle_path: