mirror of https://github.com/CIRCL/lookyloo
new: Force indexing of a specific capture from the UI

This should also make the indexing a lot faster.

pull/937/head
parent 28e81a1eae
commit 1085932ad2
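For context, this change adds a /tree/<tree_uuid>/trigger_indexing route and an "Index capture" button to the tree page, so a capture can be (re)indexed on demand instead of waiting for the background indexer. A minimal sketch of calling that route directly, assuming a reachable instance (the base URL and UUID below are placeholders, and a private instance would still require the usual authentication):

import requests

base_url = 'https://lookyloo.example.org'  # hypothetical instance URL
tree_uuid = '00000000-0000-0000-0000-000000000000'  # hypothetical capture UUID

# The new route accepts GET and POST and redirects back to the tree view.
response = requests.post(f'{base_url}/tree/{tree_uuid}/trigger_indexing',
                         allow_redirects=False)
print(response.status_code)  # expect a 302 redirect to /tree/<uuid> on success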
@@ -4,14 +4,12 @@ from __future__ import annotations
 
 import logging
 import logging.config
+from pathlib import Path
 
 from redis import Redis
-from typing import Generator
 
-from lookyloo import Lookyloo, Indexing
-from lookyloo.capturecache import get_pickle_path
+from lookyloo import Indexing
 from lookyloo.default import AbstractManager, get_config, get_socket_path
-from lookyloo.exceptions import NoValidHarFile
 
 
 logging.config.dictConfig(get_config('logging'))
@@ -21,7 +19,6 @@ class BackgroundIndexer(AbstractManager):
 
     def __init__(self, full: bool=False, loglevel: int | None=None):
         super().__init__(loglevel)
-        self.lookyloo = Lookyloo(cache_max_size=1)
         self.is_public_instance = get_config('generic', 'public_instance')
         self.full_indexer = full
         self.indexing = Indexing(full_index=self.full_indexer)
@@ -35,66 +32,22 @@ class BackgroundIndexer(AbstractManager):
 
     def _to_run_forever(self) -> None:
         self._check_indexes()
-        # Don't need the cache in this class.
-        self.lookyloo.clear_tree_cache()
 
-    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
-        # NOTE: only get the non-archived captures for now.
-        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
-            if not self.full_indexer:
-                # If we're not running the full indexer, check if the capture should be indexed.
-                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
-                    # Capture unindexed
-                    continue
-
-            if get_pickle_path(directory) is None:
-                # pickle isn't ready, we can't index.
-                continue
-            indexed = self.indexing.capture_indexed(uuid)
-            if all(indexed):
-                continue
-            yield indexed, uuid
 
     def _check_indexes(self) -> None:
-        if not self.indexing.can_index:
+        if not self.indexing.can_index():
             # There is no reason to run this method in multiple scripts.
             self.logger.info('Indexing already ongoing in another process.')
             return None
         self.logger.info(f'Check {self.script_name}...')
-        for indexed, uuid_to_index in self._to_index_no_cache():
-            try:
-                ct = self.lookyloo.get_crawled_tree(uuid_to_index)
-            except NoValidHarFile:
-                self.logger.warning(f'Broken pickle for {uuid_to_index}')
-                self.lookyloo.remove_pickle(uuid_to_index)
-                continue
-
-            if not indexed[0]:
-                self.logger.info(f'Indexing urls for {uuid_to_index}')
-                self.indexing.index_url_capture(ct)
-            if not indexed[1]:
-                self.logger.info(f'Indexing resources for {uuid_to_index}')
-                self.indexing.index_body_hashes_capture(ct)
-            if not indexed[2]:
-                self.logger.info(f'Indexing cookies for {uuid_to_index}')
-                self.indexing.index_cookies_capture(ct)
-            if not indexed[3]:
-                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
-                self.indexing.index_http_headers_hashes_capture(ct)
-            if not indexed[4]:
-                self.logger.info(f'Indexing favicons for {uuid_to_index}')
-                favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
-                self.indexing.index_favicons_capture(uuid_to_index, favicons)
-            if not indexed[5]:
-                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
-                self.indexing.index_identifiers_capture(ct)
-            if not indexed[6]:
-                self.logger.info(f'Indexing categories for {uuid_to_index}')
-                categories = self.lookyloo.categories_capture(uuid_to_index)
-                self.indexing.index_categories_capture(uuid_to_index, categories)
-            if not indexed[7]:
-                self.logger.info(f'Indexing hash types for {uuid_to_index}')
-                self.indexing.index_capture_hashes_types(ct)
+        # NOTE: only get the non-archived captures for now.
+        for uuid, d in self.redis.hscan_iter('lookup_dirs'):
+            if not self.full_indexer:
+                # If we're not running the full indexer, check if the capture should be indexed.
+                if self.is_public_instance and self.redis.hexists(d, 'no_index'):
+                    # Capture unindexed
+                    continue
+
+            self.indexing.index_capture(uuid, Path(d))
         self.indexing.indexing_done()
         self.logger.info('... done.')
@@ -16,8 +16,8 @@ import time
 from collections import OrderedDict
 from collections.abc import Mapping
 from datetime import datetime
-from functools import lru_cache, _CacheInfo as CacheInfo
-from logging import Logger, LoggerAdapter
+from functools import _CacheInfo as CacheInfo
+from logging import LoggerAdapter
 from pathlib import Path
 from typing import Any, MutableMapping, Iterator
 
@@ -28,7 +28,7 @@ from pyipasnhistory import IPASNHistory  # type: ignore[attr-defined]
 from redis import Redis
 
 from .context import Context
-from .helpers import get_captures_dir, is_locked
+from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
 from .indexing import Indexing
 from .default import LookylooException, try_make_file, get_config
 from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
@@ -106,63 +106,6 @@ class CaptureCache():
         return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
 
 
-def get_pickle_path(capture_dir: Path | str) -> Path | None:
-    if isinstance(capture_dir, str):
-        capture_dir = Path(capture_dir)
-    pickle_file_gz = capture_dir / 'tree.pickle.gz'
-    if pickle_file_gz.exists():
-        return pickle_file_gz
-
-    pickle_file = capture_dir / 'tree.pickle'
-    if pickle_file.exists():
-        return pickle_file
-
-    return None
-
-
-def remove_pickle_tree(capture_dir: Path) -> None:
-    pickle_path = get_pickle_path(capture_dir)
-    if pickle_path and pickle_path.exists():
-        pickle_path.unlink()
-
-
-@lru_cache(maxsize=64)
-def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
-    pickle_path = get_pickle_path(capture_dir)
-    tree = None
-    try:
-        if pickle_path:
-            if pickle_path.suffix == '.gz':
-                with gzip.open(pickle_path, 'rb') as _pg:
-                    tree = pickle.load(_pg)
-            else:  # not a GZ pickle
-                with pickle_path.open('rb') as _p:
-                    tree = pickle.load(_p)
-    except pickle.UnpicklingError:
-        remove_pickle_tree(capture_dir)
-    except EOFError:
-        remove_pickle_tree(capture_dir)
-    except Exception:
-        logger.exception('Unexpected exception when unpickling.')
-        remove_pickle_tree(capture_dir)
-
-    if tree:
-        try:
-            if tree.root_hartree.har.path.exists():
-                return tree
-            else:
-                # The capture was moved.
-                remove_pickle_tree(capture_dir)
-        except Exception as e:
-            logger.warning(f'The pickle is broken, removing: {e}')
-            remove_pickle_tree(capture_dir)
-
-    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
-        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
-    # The tree doesn't need to be rebuilt if there are no HAR files.
-    raise NoValidHarFile("Couldn't find HAR files")
-
-
 def serialize_sets(obj: Any) -> Any:
     if isinstance(obj, set):
         return list(obj)
@@ -3,10 +3,12 @@
 from __future__ import annotations
 
 import configparser
+import gzip
 import hashlib
 import json
 import logging
 import os
+import pickle
 import re
 import time
 
@@ -14,10 +16,11 @@ from datetime import datetime, timedelta, date
 from functools import lru_cache, cache
 from importlib.metadata import version
 from io import BufferedIOBase
+from logging import Logger
 from pathlib import Path
 from pydantic import field_validator
 from pydantic_core import from_json
-from typing import Any
+from typing import Any, TYPE_CHECKING
 from urllib.parse import urlparse
 
 
@@ -31,9 +34,10 @@ from werkzeug.user_agent import UserAgent
 from werkzeug.utils import cached_property
 
 from .default import get_homedir, safe_create_dir, get_config, LookylooException
-from .indexing import Indexing
-# from .exceptions import InvalidCaptureSetting
+from .exceptions import NoValidHarFile, TreeNeedsRebuild
 
+if TYPE_CHECKING:
+    from .indexing import Indexing
 
 logger = logging.getLogger('Lookyloo - Helpers')
 
@@ -441,8 +445,66 @@ def load_user_config(username: str) -> dict[str, Any] | None:
 
 @cache
 def get_indexing(full: bool=False) -> Indexing:
+    from .indexing import Indexing
     if not get_config('generic', 'index_everything'):
         return Indexing()
     if full:
         return Indexing(full_index=True)
     return Indexing()
+
+
+def get_pickle_path(capture_dir: Path | str) -> Path | None:
+    if isinstance(capture_dir, str):
+        capture_dir = Path(capture_dir)
+    pickle_file_gz = capture_dir / 'tree.pickle.gz'
+    if pickle_file_gz.exists():
+        return pickle_file_gz
+
+    pickle_file = capture_dir / 'tree.pickle'
+    if pickle_file.exists():
+        return pickle_file
+
+    return None
+
+
+def remove_pickle_tree(capture_dir: Path) -> None:
+    pickle_path = get_pickle_path(capture_dir)
+    if pickle_path and pickle_path.exists():
+        pickle_path.unlink()
+
+
+@lru_cache(maxsize=64)
+def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
+    pickle_path = get_pickle_path(capture_dir)
+    tree = None
+    try:
+        if pickle_path:
+            if pickle_path.suffix == '.gz':
+                with gzip.open(pickle_path, 'rb') as _pg:
+                    tree = pickle.load(_pg)
+            else:  # not a GZ pickle
+                with pickle_path.open('rb') as _p:
+                    tree = pickle.load(_p)
+    except pickle.UnpicklingError:
+        remove_pickle_tree(capture_dir)
+    except EOFError:
+        remove_pickle_tree(capture_dir)
+    except Exception:
+        logger.exception('Unexpected exception when unpickling.')
+        remove_pickle_tree(capture_dir)
+
+    if tree:
+        try:
+            if tree.root_hartree.har.path.exists():
+                return tree
+            else:
+                # The capture was moved.
+                remove_pickle_tree(capture_dir)
+        except Exception as e:
+            logger.warning(f'The pickle is broken, removing: {e}')
+            remove_pickle_tree(capture_dir)
+
+    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
+        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
+    # The tree doesn't need to be rebuilt if there are no HAR files.
+    raise NoValidHarFile("Couldn't find HAR files")
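The pickle helpers moved into this module are cached with lru_cache keyed on the capture directory and its mtime, so a rebuilt or removed pickle is picked up on the next call rather than served stale. A minimal sketch of calling the relocated helper, with a hypothetical capture path:

from logging import getLogger
from pathlib import Path

from lookyloo.helpers import load_pickle_tree  # lives in helpers after this change

capture_dir = Path('/path/to/a/capture')  # hypothetical capture directory
# The mtime is part of the cache key: a directory touched after a rebuild
# makes the next call load the new pickle instead of the cached tree.
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, getLogger(__name__))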
@@ -15,13 +15,15 @@ import mmh3
 
 from bs4 import BeautifulSoup
 from hashlib import sha256
+from pathlib import Path
 
 from har2tree import CrawledTree
 from redis import ConnectionPool, Redis
 from redis.connection import UnixDomainSocketConnection
 
+from .exceptions import NoValidHarFile, TreeNeedsRebuild
+from .helpers import load_pickle_tree
 from .default import get_socket_path, get_config
-# from .helpers import get_public_suffix_list
 
 
 class Indexing():
@@ -53,12 +55,17 @@ class Indexing():
     def redis(self) -> Redis:  # type: ignore[type-arg]
         return Redis(connection_pool=self.__redis_pool)
 
-    @property
-    def can_index(self) -> bool:
+    def can_index(self, capture_uuid: str | None=None) -> bool:
+        if capture_uuid:
+            return bool(self.redis.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))
+
         return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))
 
-    def indexing_done(self) -> None:
-        self.redis.delete('ongoing_indexing')
+    def indexing_done(self, capture_uuid: str | None=None) -> None:
+        if capture_uuid:
+            self.redis.delete(f'ongoing_indexing|{capture_uuid}')
+        else:
+            self.redis.delete('ongoing_indexing')
 
     def force_reindex(self, capture_uuid: str) -> None:
         p = self.redis.pipeline()
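The hunk above turns can_index into a method that can also take a capture UUID: a per-capture lock with a shorter expiry, alongside the existing global one used by the background indexer. Both rely on Redis SET with NX and an expiry as a best-effort lock that releases itself if a worker dies. A minimal sketch of that pattern; the key names and timeouts come from the diff, while the connection details are assumptions (the real class uses a connection pool on a unix socket):

from redis import Redis

redis_client = Redis(unix_socket_path='/path/to/indexing.sock', decode_responses=True)  # hypothetical socket

def can_index(capture_uuid: str | None = None) -> bool:
    if capture_uuid:
        # Per-capture lock: only set if the key doesn't exist, auto-expires after 6 minutes.
        return bool(redis_client.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))
    # Global lock for the background indexer, auto-expires after an hour.
    return bool(redis_client.set('ongoing_indexing', 1, ex=3600, nx=True))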
@@ -91,6 +98,55 @@ class Indexing():
         # This call for sure returns a tuple of 7 booleans
         return tuple(to_return)  # type: ignore[return-value]
 
+    def index_capture(self, uuid_to_index: str, directory: Path) -> None:
+        if not self.can_index(uuid_to_index):
+            self.logger.info(f'Indexing on {uuid_to_index} ongoing, skipping. ')
+            return
+
+        try:
+            indexed = self.capture_indexed(uuid_to_index)
+            if all(indexed):
+                return
+
+            if not any((directory / pickle_name).exists()
+                       for pickle_name in ['tree.pickle.gz', 'tree.pickle']):
+                self.logger.warning(f'No pickle for {uuid_to_index}, skipping. ')
+                return
+
+            # do the indexing
+            ct = load_pickle_tree(directory, directory.stat().st_mtime, self.logger)
+            if not indexed[0]:
+                self.logger.info(f'Indexing urls for {uuid_to_index}')
+                self.index_url_capture(ct)
+            if not indexed[1]:
+                self.logger.info(f'Indexing resources for {uuid_to_index}')
+                self.index_body_hashes_capture(ct)
+            if not indexed[2]:
+                self.logger.info(f'Indexing cookies for {uuid_to_index}')
+                self.index_cookies_capture(ct)
+            if not indexed[3]:
+                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
+                self.index_http_headers_hashes_capture(ct)
+            if not indexed[4]:
+                self.logger.info(f'Indexing favicons for {uuid_to_index}')
+                self.index_favicons_capture(uuid_to_index, directory)
+            if not indexed[5]:
+                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
+                self.index_identifiers_capture(ct)
+            if not indexed[6]:
+                self.logger.info(f'Indexing categories for {uuid_to_index}')
+                self.index_categories_capture(uuid_to_index, directory)
+            if not indexed[7]:
+                self.logger.info(f'Indexing hash types for {uuid_to_index}')
+                self.index_capture_hashes_types(ct)
+
+        except (TreeNeedsRebuild, NoValidHarFile) as e:
+            self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
+        except Exception as e:
+            self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
+        finally:
+            self.indexing_done(uuid_to_index)
+
     # ###### Cookies ######
 
     @property
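With index_capture in place, a single capture can be indexed on demand by any process that has the capture directory at hand, which is exactly what the web route added further down does. A minimal usage sketch, with a hypothetical UUID and path:

from pathlib import Path

from lookyloo.indexing import Indexing

indexing = Indexing()
capture_uuid = '00000000-0000-0000-0000-000000000000'  # hypothetical UUID
capture_dir = Path('/path/to/captures/some_capture')   # hypothetical directory

# Idempotent: returns early if everything is already indexed, skips if another
# process holds the per-capture lock, and always releases the lock in `finally`.
indexing.index_capture(capture_uuid, capture_dir)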
@@ -349,18 +405,16 @@ class Indexing():
     def favicon_number_captures(self, favicon_sha512: str) -> int:
         return self.redis.scard(f'favicons|{favicon_sha512}|captures')
 
-    def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
+    def index_favicons_capture(self, capture_uuid: str, capture_dir: Path) -> None:
         if self.redis.sismember('indexed_favicons', capture_uuid):
             # Do not reindex
             return
         self.redis.sadd('indexed_favicons', capture_uuid)
         self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
         pipeline = self.redis.pipeline()
-        with ZipFile(favicons, 'r') as myzip:
-            for name in myzip.namelist():
-                if not name.endswith('.ico'):
-                    continue
-                favicon = myzip.read(name)
+        for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
+            with favicon_path.open('rb') as f:
+                favicon = f.read()
                 if not favicon:
                     # Empty file, ignore.
                     continue
@@ -552,11 +606,20 @@ class Indexing():
     def categories(self) -> set[str]:
         return self.redis.smembers('categories')
 
-    def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
+    def index_categories_capture(self, capture_uuid: str, capture_dir: Path) -> None:
         if self.redis.sismember('indexed_categories', capture_uuid):
             # do not reindex
             return
+        # Make sure we don't reindex
         self.redis.sadd('indexed_categories', capture_uuid)
+
+        categ_file = capture_dir / 'categories'
+        if categ_file.exists():
+            with categ_file.open('r') as f:
+                capture_categories = [c.strip() for c in f.readlines()]
+        else:
+            return
+
         added_in_existing_categories = set()
         pipeline = self.redis.pipeline()
         for c in self.categories:
@@ -1142,6 +1142,14 @@ def send_mail(tree_uuid: str) -> WerkzeugResponse:
     return redirect(url_for('tree', tree_uuid=tree_uuid))
 
 
+@app.route('/tree/<string:tree_uuid>/trigger_indexing', methods=['POST', 'GET'])
+def trigger_indexing(tree_uuid: str) -> WerkzeugResponse:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if cache and hasattr(cache, 'capture_dir'):
+        get_indexing(flask_login.current_user).index_capture(tree_uuid, cache.capture_dir)
+    return redirect(url_for('tree', tree_uuid=tree_uuid))
+
+
 @app.route('/tree/<string:tree_uuid>', methods=['GET'])
 @app.route('/tree/<string:tree_uuid>/<string:node_uuid>', methods=['GET'])
 def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
@@ -1199,6 +1207,11 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
             monitoring_collections = []
             flash(f'Unable to get existing connections from the monitoring : {e}', 'warning')
 
+    # Check if the capture has been indexed yet. Print a warning if not.
+    capture_indexed = all(get_indexing(flask_login.current_user).capture_indexed(tree_uuid))
+    if not capture_indexed:
+        flash('The capture has not been indexed yet. Some correlations will be missing.', 'warning')
+
     return render_template('tree.html', tree_json=ct.to_json(),
                            info=cache,
                            tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@@ -1221,6 +1234,7 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
                            confirm_message=confirm_message if confirm_message else 'Tick to confirm.',
                            parent_uuid=cache.parent,
                            has_redirects=True if cache.redirects else False,
+                           capture_indexed=capture_indexed,
                            capture_settings=capture_settings.model_dump(exclude_none=True) if capture_settings else {})
 
     except NoValidHarFile:
@@ -407,6 +407,10 @@
           <div id="tools-menu" class="dropdown">
             <button class="dropbtn">Analytical Tools</button>
             <div id="tools-menu-content" class="dropdown-content">
+              {% if not capture_indexed %}
+              <a href="{{ url_for('trigger_indexing', tree_uuid=tree_uuid) }}" role="button" class="btn btn-warning"
+                 title="The capture isn't (fully) indexed, index now.">Index capture</a>
+              {% endif %}
               <a href="#modulesModal" data-remote="{{ url_for('trigger_modules', tree_uuid=tree_uuid, force=False) }}"
                  data-bs-toggle="modal" data-bs-target="#modulesModal" role="button"
                  title="Lookups from supported 3rd party services">Third Party Reports</a>