From 1085932ad227d2c2cc4a12f00e7c4773665550f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Thu, 29 Aug 2024 13:32:38 +0200
Subject: [PATCH] new: Force indexing of a specific capture from the UI

This should also make the indexing a lot faster.
---
 bin/background_indexer.py       | 69 +++++----------------------
 lookyloo/capturecache.py        | 63 ++-----------------------
 lookyloo/helpers.py             | 68 ++++++++++++++++++++++++--
 lookyloo/indexing.py            | 87 ++++++++++++++++++++++++++++-----
 website/web/__init__.py         | 14 ++++++
 website/web/templates/tree.html |  4 ++
 6 files changed, 172 insertions(+), 133 deletions(-)

diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index 3a706816..8089f31d 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -4,14 +4,12 @@ from __future__ import annotations
 
 import logging
 import logging.config
+from pathlib import Path
 
 from redis import Redis
-from typing import Generator
 
-from lookyloo import Lookyloo, Indexing
-from lookyloo.capturecache import get_pickle_path
+from lookyloo import Indexing
 from lookyloo.default import AbstractManager, get_config, get_socket_path
-from lookyloo.exceptions import NoValidHarFile
 
 logging.config.dictConfig(get_config('logging'))
 
@@ -21,7 +19,6 @@ class BackgroundIndexer(AbstractManager):
 
     def __init__(self, full: bool=False, loglevel: int | None=None):
         super().__init__(loglevel)
-        self.lookyloo = Lookyloo(cache_max_size=1)
         self.is_public_instance = get_config('generic', 'public_instance')
         self.full_indexer = full
         self.indexing = Indexing(full_index=self.full_indexer)
@@ -35,66 +32,22 @@ class BackgroundIndexer(AbstractManager):
 
     def _to_run_forever(self) -> None:
         self._check_indexes()
-        # Don't need the cache in this class.
-        self.lookyloo.clear_tree_cache()
-
-    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
-        # NOTE: only get the non-archived captures for now.
-        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
-            if not self.full_indexer:
-                # If we're not running the full indexer, check if the capture should be indexed.
-                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
-                    # Capture unindexed
-                    continue
-
-            if get_pickle_path(directory) is None:
-                # pickle isn't ready, we can't index.
-                continue
-            indexed = self.indexing.capture_indexed(uuid)
-            if all(indexed):
-                continue
-            yield indexed, uuid
 
     def _check_indexes(self) -> None:
-        if not self.indexing.can_index:
+        if not self.indexing.can_index():
             # There is no reason to run this method in multiple scripts.
             self.logger.info('Indexing already ongoing in another process.')
            return None
 
         self.logger.info(f'Check {self.script_name}...')
-        for indexed, uuid_to_index in self._to_index_no_cache():
-            try:
-                ct = self.lookyloo.get_crawled_tree(uuid_to_index)
-            except NoValidHarFile:
-                self.logger.warning(f'Broken pickle for {uuid_to_index}')
-                self.lookyloo.remove_pickle(uuid_to_index)
-                continue
+        # NOTE: only get the non-archived captures for now.
+        for uuid, d in self.redis.hscan_iter('lookup_dirs'):
+            if not self.full_indexer:
+                # If we're not running the full indexer, check if the capture should be indexed.
+                if self.is_public_instance and self.redis.hexists(d, 'no_index'):
+                    # Capture unindexed
+                    continue
 
-            if not indexed[0]:
-                self.logger.info(f'Indexing urls for {uuid_to_index}')
-                self.indexing.index_url_capture(ct)
-            if not indexed[1]:
-                self.logger.info(f'Indexing resources for {uuid_to_index}')
-                self.indexing.index_body_hashes_capture(ct)
-            if not indexed[2]:
-                self.logger.info(f'Indexing cookies for {uuid_to_index}')
-                self.indexing.index_cookies_capture(ct)
-            if not indexed[3]:
-                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
-                self.indexing.index_http_headers_hashes_capture(ct)
-            if not indexed[4]:
-                self.logger.info(f'Indexing favicons for {uuid_to_index}')
-                favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
-                self.indexing.index_favicons_capture(uuid_to_index, favicons)
-            if not indexed[5]:
-                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
-                self.indexing.index_identifiers_capture(ct)
-            if not indexed[6]:
-                self.logger.info(f'Indexing categories for {uuid_to_index}')
-                categories = self.lookyloo.categories_capture(uuid_to_index)
-                self.indexing.index_categories_capture(uuid_to_index, categories)
-            if not indexed[7]:
-                self.logger.info(f'Indexing hash types for {uuid_to_index}')
-                self.indexing.index_capture_hashes_types(ct)
+            self.indexing.index_capture(uuid, Path(d))
         self.indexing.indexing_done()
         self.logger.info('... done.')
 
diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py
index 7f257f0c..9ff76946 100644
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -16,8 +16,8 @@ import time
 
 from collections import OrderedDict
 from collections.abc import Mapping
 from datetime import datetime
-from functools import lru_cache, _CacheInfo as CacheInfo
-from logging import Logger, LoggerAdapter
+from functools import _CacheInfo as CacheInfo
+from logging import LoggerAdapter
 from pathlib import Path
 from typing import Any, MutableMapping, Iterator
 
@@ -28,7 +28,7 @@ from pyipasnhistory import IPASNHistory  # type: ignore[attr-defined]
 from redis import Redis
 
 from .context import Context
-from .helpers import get_captures_dir, is_locked
+from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
 from .indexing import Indexing
 from .default import LookylooException, try_make_file, get_config
 from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
@@ -106,63 +106,6 @@ class CaptureCache():
         return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
 
 
-def get_pickle_path(capture_dir: Path | str) -> Path | None:
-    if isinstance(capture_dir, str):
-        capture_dir = Path(capture_dir)
-    pickle_file_gz = capture_dir / 'tree.pickle.gz'
-    if pickle_file_gz.exists():
-        return pickle_file_gz
-
-    pickle_file = capture_dir / 'tree.pickle'
-    if pickle_file.exists():
-        return pickle_file
-
-    return None
-
-
-def remove_pickle_tree(capture_dir: Path) -> None:
-    pickle_path = get_pickle_path(capture_dir)
-    if pickle_path and pickle_path.exists():
-        pickle_path.unlink()
-
-
-@lru_cache(maxsize=64)
-def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
-    pickle_path = get_pickle_path(capture_dir)
-    tree = None
-    try:
-        if pickle_path:
-            if pickle_path.suffix == '.gz':
-                with gzip.open(pickle_path, 'rb') as _pg:
-                    tree = pickle.load(_pg)
-            else:  # not a GZ pickle
-                with pickle_path.open('rb') as _p:
-                    tree = pickle.load(_p)
-    except pickle.UnpicklingError:
-        remove_pickle_tree(capture_dir)
-    except EOFError:
-        remove_pickle_tree(capture_dir)
-    except Exception:
-        logger.exception('Unexpected exception when unpickling.')
-        remove_pickle_tree(capture_dir)
-
-    if tree:
-        try:
-            if tree.root_hartree.har.path.exists():
-                return tree
-            else:
-                # The capture was moved.
-                remove_pickle_tree(capture_dir)
-        except Exception as e:
-            logger.warning(f'The pickle is broken, removing: {e}')
-            remove_pickle_tree(capture_dir)
-
-    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
-        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
-    # The tree doesn't need to be rebuilt if there are no HAR files.
-    raise NoValidHarFile("Couldn't find HAR files")
-
-
 def serialize_sets(obj: Any) -> Any:
     if isinstance(obj, set):
         return list(obj)
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index 947c34ed..ed0a5c91 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -3,10 +3,12 @@
 from __future__ import annotations
 
 import configparser
+import gzip
 import hashlib
 import json
 import logging
 import os
+import pickle
 import re
 import time
 
@@ -14,10 +16,11 @@ from datetime import datetime, timedelta, date
 from functools import lru_cache, cache
 from importlib.metadata import version
 from io import BufferedIOBase
+from logging import Logger
 from pathlib import Path
 from pydantic import field_validator
 from pydantic_core import from_json
-from typing import Any
+from typing import Any, TYPE_CHECKING
 from urllib.parse import urlparse
 
@@ -31,9 +34,10 @@ from werkzeug.user_agent import UserAgent
 from werkzeug.utils import cached_property
 
 from .default import get_homedir, safe_create_dir, get_config, LookylooException
-from .indexing import Indexing
-# from .exceptions import InvalidCaptureSetting
+from .exceptions import NoValidHarFile, TreeNeedsRebuild
 
+if TYPE_CHECKING:
+    from .indexing import Indexing
 
 logger = logging.getLogger('Lookyloo - Helpers')
 
@@ -441,8 +445,66 @@ def load_user_config(username: str) -> dict[str, Any] | None:
 
 @cache
 def get_indexing(full: bool=False) -> Indexing:
+    from .indexing import Indexing
     if not get_config('generic', 'index_everything'):
         return Indexing()
     if full:
         return Indexing(full_index=True)
     return Indexing()
+
+
+def get_pickle_path(capture_dir: Path | str) -> Path | None:
+    if isinstance(capture_dir, str):
+        capture_dir = Path(capture_dir)
+    pickle_file_gz = capture_dir / 'tree.pickle.gz'
+    if pickle_file_gz.exists():
+        return pickle_file_gz
+
+    pickle_file = capture_dir / 'tree.pickle'
+    if pickle_file.exists():
+        return pickle_file
+
+    return None
+
+
+def remove_pickle_tree(capture_dir: Path) -> None:
+    pickle_path = get_pickle_path(capture_dir)
+    if pickle_path and pickle_path.exists():
+        pickle_path.unlink()
+
+
+@lru_cache(maxsize=64)
+def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
+    pickle_path = get_pickle_path(capture_dir)
+    tree = None
+    try:
+        if pickle_path:
+            if pickle_path.suffix == '.gz':
+                with gzip.open(pickle_path, 'rb') as _pg:
+                    tree = pickle.load(_pg)
+            else:  # not a GZ pickle
+                with pickle_path.open('rb') as _p:
+                    tree = pickle.load(_p)
+    except pickle.UnpicklingError:
+        remove_pickle_tree(capture_dir)
+    except EOFError:
+        remove_pickle_tree(capture_dir)
+    except Exception:
+        logger.exception('Unexpected exception when unpickling.')
+        remove_pickle_tree(capture_dir)
+
+    if tree:
+        try:
+            if tree.root_hartree.har.path.exists():
+                return tree
+            else:
+                # The capture was moved.
+                remove_pickle_tree(capture_dir)
+        except Exception as e:
+            logger.warning(f'The pickle is broken, removing: {e}')
+            remove_pickle_tree(capture_dir)
+
+    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
+        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
+    # The tree doesn't need to be rebuilt if there are no HAR files.
+    raise NoValidHarFile("Couldn't find HAR files")
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index a81ad0bc..1465a069 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -15,13 +15,15 @@ import mmh3
 
 from bs4 import BeautifulSoup
 from hashlib import sha256
+from pathlib import Path
 
 from har2tree import CrawledTree
 from redis import ConnectionPool, Redis
 from redis.connection import UnixDomainSocketConnection
 
+from .exceptions import NoValidHarFile, TreeNeedsRebuild
+from .helpers import load_pickle_tree
 from .default import get_socket_path, get_config
-# from .helpers import get_public_suffix_list
 
 
 class Indexing():
@@ -53,12 +55,17 @@
     def redis(self) -> Redis:  # type: ignore[type-arg]
         return Redis(connection_pool=self.__redis_pool)
 
-    @property
-    def can_index(self) -> bool:
+    def can_index(self, capture_uuid: str | None=None) -> bool:
+        if capture_uuid:
+            return bool(self.redis.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))
+
         return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))
 
-    def indexing_done(self) -> None:
-        self.redis.delete('ongoing_indexing')
+    def indexing_done(self, capture_uuid: str | None=None) -> None:
+        if capture_uuid:
+            self.redis.delete(f'ongoing_indexing|{capture_uuid}')
+        else:
+            self.redis.delete('ongoing_indexing')
 
     def force_reindex(self, capture_uuid: str) -> None:
         p = self.redis.pipeline()
@@ -91,6 +98,55 @@
         # This call for sure returns a tuple of 7 booleans
         return tuple(to_return)  # type: ignore[return-value]
 
+    def index_capture(self, uuid_to_index: str, directory: Path) -> None:
+        if not self.can_index(uuid_to_index):
+            self.logger.info(f'Indexing on {uuid_to_index} ongoing, skipping.')
+            return
+
+        try:
+            indexed = self.capture_indexed(uuid_to_index)
+            if all(indexed):
+                return
+
+            if not any((directory / pickle_name).exists()
+                       for pickle_name in ['tree.pickle.gz', 'tree.pickle']):
+                self.logger.warning(f'No pickle for {uuid_to_index}, skipping.')
+                return
+
+            # do the indexing
+            ct = load_pickle_tree(directory, directory.stat().st_mtime, self.logger)
+            if not indexed[0]:
+                self.logger.info(f'Indexing urls for {uuid_to_index}')
+                self.index_url_capture(ct)
+            if not indexed[1]:
+                self.logger.info(f'Indexing resources for {uuid_to_index}')
+                self.index_body_hashes_capture(ct)
+            if not indexed[2]:
+                self.logger.info(f'Indexing cookies for {uuid_to_index}')
+                self.index_cookies_capture(ct)
+            if not indexed[3]:
+                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
+                self.index_http_headers_hashes_capture(ct)
+            if not indexed[4]:
+                self.logger.info(f'Indexing favicons for {uuid_to_index}')
+                self.index_favicons_capture(uuid_to_index, directory)
+            if not indexed[5]:
+                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
+                self.index_identifiers_capture(ct)
+            if not indexed[6]:
+                self.logger.info(f'Indexing categories for {uuid_to_index}')
+                self.index_categories_capture(uuid_to_index, directory)
+            if not indexed[7]:
+                self.logger.info(f'Indexing hash types for {uuid_to_index}')
+                self.index_capture_hashes_types(ct)
+
+        except (TreeNeedsRebuild, NoValidHarFile) as e:
+            self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
+        except Exception as e:
+            self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
+        finally:
+            self.indexing_done(uuid_to_index)
+
     # ###### Cookies ######
 
     @property
@@ -349,18 +405,16 @@
     def favicon_number_captures(self, favicon_sha512: str) -> int:
         return self.redis.scard(f'favicons|{favicon_sha512}|captures')
 
-    def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
+    def index_favicons_capture(self, capture_uuid: str, capture_dir: Path) -> None:
         if self.redis.sismember('indexed_favicons', capture_uuid):
             # Do not reindex
             return
         self.redis.sadd('indexed_favicons', capture_uuid)
         self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
         pipeline = self.redis.pipeline()
-        with ZipFile(favicons, 'r') as myzip:
-            for name in myzip.namelist():
-                if not name.endswith('.ico'):
-                    continue
-                favicon = myzip.read(name)
+        for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
+            with favicon_path.open('rb') as f:
+                favicon = f.read()
             if not favicon:
                 # Empty file, ignore.
                 continue
@@ -552,11 +606,20 @@
     def categories(self) -> set[str]:
         return self.redis.smembers('categories')
 
-    def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
+    def index_categories_capture(self, capture_uuid: str, capture_dir: Path) -> None:
         if self.redis.sismember('indexed_categories', capture_uuid):
             # do not reindex
             return
+        # Make sure we don't reindex
         self.redis.sadd('indexed_categories', capture_uuid)
+
+        categ_file = capture_dir / 'categories'
+        if categ_file.exists():
+            with categ_file.open('r') as f:
+                capture_categories = [c.strip() for c in f.readlines()]
+        else:
+            return
+
         added_in_existing_categories = set()
         pipeline = self.redis.pipeline()
         for c in self.categories:
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 188c845a..57ed6174 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -1142,6 +1142,14 @@ def send_mail(tree_uuid: str) -> WerkzeugResponse:
     return redirect(url_for('tree', tree_uuid=tree_uuid))
 
 
+@app.route('/tree/<string:tree_uuid>/trigger_indexing', methods=['POST', 'GET'])
+def trigger_indexing(tree_uuid: str) -> WerkzeugResponse:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if cache and hasattr(cache, 'capture_dir'):
+        get_indexing(flask_login.current_user).index_capture(tree_uuid, cache.capture_dir)
+    return redirect(url_for('tree', tree_uuid=tree_uuid))
+
+
 @app.route('/tree/<string:tree_uuid>', methods=['GET'])
 @app.route('/tree/<string:tree_uuid>/<string:node_uuid>', methods=['GET'])
 def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
@@ -1199,6 +1207,11 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | Werkzeu
                 monitoring_collections = []
                 flash(f'Unable to get existing connections from the monitoring : {e}', 'warning')
 
+        # Check if the capture has been indexed yet. Print a warning if not.
+        capture_indexed = all(get_indexing(flask_login.current_user).capture_indexed(tree_uuid))
+        if not capture_indexed:
+            flash('The capture has not been indexed yet. Some correlations will be missing.', 'warning')
+
         return render_template('tree.html', tree_json=ct.to_json(),
                                info=cache,
                                tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@@ -1221,6 +1234,7 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | Werkzeu
                                confirm_message=confirm_message if confirm_message else 'Tick to confirm.',
                                parent_uuid=cache.parent,
                                has_redirects=True if cache.redirects else False,
+                               capture_indexed=capture_indexed,
                                capture_settings=capture_settings.model_dump(exclude_none=True) if capture_settings else {})
 
     except NoValidHarFile:
diff --git a/website/web/templates/tree.html b/website/web/templates/tree.html
index 1a0a22d7..cbc3db44 100644
--- a/website/web/templates/tree.html
+++ b/website/web/templates/tree.html
@@ -407,6 +407,10 @@
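
A minimal sketch of how the new trigger_indexing route could be exercised over HTTP once the patch is applied. The instance URL and capture UUID below are placeholders, not part of the patch, and the snippet assumes the Python requests package is available:

    import requests

    # Placeholders: point these at your own Lookyloo instance and an existing capture.
    LOOKYLOO_URL = 'http://127.0.0.1:5100'
    CAPTURE_UUID = '00000000-0000-0000-0000-000000000000'

    # The route added above accepts GET and POST; it calls Indexing.index_capture()
    # for that capture (a no-op if it is already fully indexed or locked by another
    # indexing run), then redirects back to the tree view.
    r = requests.post(f'{LOOKYLOO_URL}/tree/{CAPTURE_UUID}/trigger_indexing',
                      allow_redirects=False, timeout=30)
    print(r.status_code)  # expect a 302 redirect back to /tree/<uuid>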