new: Force indexing of a specific capture from the UI

This should also make the indexing a lot faster.
pull/937/head
Raphaël Vinot 2024-08-29 13:32:38 +02:00
parent 28e81a1eae
commit 1085932ad2
6 changed files with 172 additions and 133 deletions

View File

@@ -4,14 +4,12 @@ from __future__ import annotations
import logging
import logging.config
from pathlib import Path
from redis import Redis
from typing import Generator
from lookyloo import Lookyloo, Indexing
from lookyloo.capturecache import get_pickle_path
from lookyloo import Indexing
from lookyloo.default import AbstractManager, get_config, get_socket_path
from lookyloo.exceptions import NoValidHarFile
logging.config.dictConfig(get_config('logging'))
@@ -21,7 +19,6 @@ class BackgroundIndexer(AbstractManager):
def __init__(self, full: bool=False, loglevel: int | None=None):
super().__init__(loglevel)
self.lookyloo = Lookyloo(cache_max_size=1)
self.is_public_instance = get_config('generic', 'public_instance')
self.full_indexer = full
self.indexing = Indexing(full_index=self.full_indexer)
@@ -35,66 +32,22 @@ class BackgroundIndexer(AbstractManager):
def _to_run_forever(self) -> None:
self._check_indexes()
# Don't need the cache in this class.
self.lookyloo.clear_tree_cache()
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
# NOTE: only get the non-archived captures for now.
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer:
# If we're not running the full indexer, check if the capture should be indexed.
if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
# Capture unindexed
continue
if get_pickle_path(directory) is None:
# pickle isn't ready, we can't index.
continue
indexed = self.indexing.capture_indexed(uuid)
if all(indexed):
continue
yield indexed, uuid
def _check_indexes(self) -> None:
if not self.indexing.can_index:
if not self.indexing.can_index():
# There is no reason to run this method in multiple scripts.
self.logger.info('Indexing already ongoing in another process.')
return None
self.logger.info(f'Check {self.script_name}...')
for indexed, uuid_to_index in self._to_index_no_cache():
try:
ct = self.lookyloo.get_crawled_tree(uuid_to_index)
except NoValidHarFile:
self.logger.warning(f'Broken pickle for {uuid_to_index}')
self.lookyloo.remove_pickle(uuid_to_index)
continue
# NOTE: only get the non-archived captures for now.
for uuid, d in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer:
# If we're not running the full indexer, check if the capture should be indexed.
if self.is_public_instance and self.redis.hexists(d, 'no_index'):
# Capture unindexed
continue
if not indexed[0]:
self.logger.info(f'Indexing urls for {uuid_to_index}')
self.indexing.index_url_capture(ct)
if not indexed[1]:
self.logger.info(f'Indexing resources for {uuid_to_index}')
self.indexing.index_body_hashes_capture(ct)
if not indexed[2]:
self.logger.info(f'Indexing cookies for {uuid_to_index}')
self.indexing.index_cookies_capture(ct)
if not indexed[3]:
self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
self.indexing.index_http_headers_hashes_capture(ct)
if not indexed[4]:
self.logger.info(f'Indexing favicons for {uuid_to_index}')
favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
self.indexing.index_favicons_capture(uuid_to_index, favicons)
if not indexed[5]:
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.indexing.index_identifiers_capture(ct)
if not indexed[6]:
self.logger.info(f'Indexing categories for {uuid_to_index}')
categories = self.lookyloo.categories_capture(uuid_to_index)
self.indexing.index_categories_capture(uuid_to_index, categories)
if not indexed[7]:
self.logger.info(f'Indexing hash types for {uuid_to_index}')
self.indexing.index_capture_hashes_types(ct)
self.indexing.index_capture(uuid, Path(d))
self.indexing.indexing_done()
self.logger.info('... done.')
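With this change the per-capture work lives in Indexing.index_capture, so the background loop only has to walk lookup_dirs and hand each capture over. A minimal sketch of that flow (not the committed code), assuming get_socket_path('cache') points at the cache Redis socket and a decoded-responses client:

from pathlib import Path
from redis import Redis
from lookyloo import Indexing
from lookyloo.default import get_config, get_socket_path

redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
indexing = Indexing(full_index=False)
is_public_instance = get_config('generic', 'public_instance')

for uuid, directory in redis.hscan_iter('lookup_dirs'):
    if is_public_instance and redis.hexists(directory, 'no_index'):
        continue  # capture explicitly excluded from indexing on a public instance
    # index_capture takes its own per-capture lock and skips already-indexed captures.
    indexing.index_capture(uuid, Path(directory))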

View File

@@ -16,8 +16,8 @@ import time
from collections import OrderedDict
from collections.abc import Mapping
from datetime import datetime
from functools import lru_cache, _CacheInfo as CacheInfo
from logging import Logger, LoggerAdapter
from functools import _CacheInfo as CacheInfo
from logging import LoggerAdapter
from pathlib import Path
from typing import Any, MutableMapping, Iterator
@@ -28,7 +28,7 @@ from pyipasnhistory import IPASNHistory # type: ignore[attr-defined]
from redis import Redis
from .context import Context
from .helpers import get_captures_dir, is_locked
from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
from .indexing import Indexing
from .default import LookylooException, try_make_file, get_config
from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
@@ -106,63 +106,6 @@ class CaptureCache():
return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
def get_pickle_path(capture_dir: Path | str) -> Path | None:
if isinstance(capture_dir, str):
capture_dir = Path(capture_dir)
pickle_file_gz = capture_dir / 'tree.pickle.gz'
if pickle_file_gz.exists():
return pickle_file_gz
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
return pickle_file
return None
def remove_pickle_tree(capture_dir: Path) -> None:
pickle_path = get_pickle_path(capture_dir)
if pickle_path and pickle_path.exists():
pickle_path.unlink()
@lru_cache(maxsize=64)
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
pickle_path = get_pickle_path(capture_dir)
tree = None
try:
if pickle_path:
if pickle_path.suffix == '.gz':
with gzip.open(pickle_path, 'rb') as _pg:
tree = pickle.load(_pg)
else: # not a GZ pickle
with pickle_path.open('rb') as _p:
tree = pickle.load(_p)
except pickle.UnpicklingError:
remove_pickle_tree(capture_dir)
except EOFError:
remove_pickle_tree(capture_dir)
except Exception:
logger.exception('Unexpected exception when unpickling.')
remove_pickle_tree(capture_dir)
if tree:
try:
if tree.root_hartree.har.path.exists():
return tree
else:
# The capture was moved.
remove_pickle_tree(capture_dir)
except Exception as e:
logger.warning(f'The pickle is broken, removing: {e}')
remove_pickle_tree(capture_dir)
if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
# The tree doesn't need to be rebuilt if there are no HAR files.
raise NoValidHarFile("Couldn't find HAR files")
def serialize_sets(obj: Any) -> Any:
if isinstance(obj, set):
return list(obj)

View File

@@ -3,10 +3,12 @@
from __future__ import annotations
import configparser
import gzip
import hashlib
import json
import logging
import os
import pickle
import re
import time
@@ -14,10 +16,11 @@ from datetime import datetime, timedelta, date
from functools import lru_cache, cache
from importlib.metadata import version
from io import BufferedIOBase
from logging import Logger
from pathlib import Path
from pydantic import field_validator
from pydantic_core import from_json
from typing import Any
from typing import Any, TYPE_CHECKING
from urllib.parse import urlparse
@@ -31,9 +34,10 @@ from werkzeug.user_agent import UserAgent
from werkzeug.utils import cached_property
from .default import get_homedir, safe_create_dir, get_config, LookylooException
from .indexing import Indexing
# from .exceptions import InvalidCaptureSetting
from .exceptions import NoValidHarFile, TreeNeedsRebuild
if TYPE_CHECKING:
from .indexing import Indexing
logger = logging.getLogger('Lookyloo - Helpers')
@@ -441,8 +445,66 @@ def load_user_config(username: str) -> dict[str, Any] | None:
@cache
def get_indexing(full: bool=False) -> Indexing:
from .indexing import Indexing
if not get_config('generic', 'index_everything'):
return Indexing()
if full:
return Indexing(full_index=True)
return Indexing()
def get_pickle_path(capture_dir: Path | str) -> Path | None:
if isinstance(capture_dir, str):
capture_dir = Path(capture_dir)
pickle_file_gz = capture_dir / 'tree.pickle.gz'
if pickle_file_gz.exists():
return pickle_file_gz
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
return pickle_file
return None
def remove_pickle_tree(capture_dir: Path) -> None:
pickle_path = get_pickle_path(capture_dir)
if pickle_path and pickle_path.exists():
pickle_path.unlink()
@lru_cache(maxsize=64)
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
pickle_path = get_pickle_path(capture_dir)
tree = None
try:
if pickle_path:
if pickle_path.suffix == '.gz':
with gzip.open(pickle_path, 'rb') as _pg:
tree = pickle.load(_pg)
else: # not a GZ pickle
with pickle_path.open('rb') as _p:
tree = pickle.load(_p)
except pickle.UnpicklingError:
remove_pickle_tree(capture_dir)
except EOFError:
remove_pickle_tree(capture_dir)
except Exception:
logger.exception('Unexpected exception when unpickling.')
remove_pickle_tree(capture_dir)
if tree:
try:
if tree.root_hartree.har.path.exists():
return tree
else:
# The capture was moved.
remove_pickle_tree(capture_dir)
except Exception as e:
logger.warning(f'The pickle is broken, removing: {e}')
remove_pickle_tree(capture_dir)
if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
# The tree doesn't need to be rebuilt if there are no HAR files.
raise NoValidHarFile("Couldn't find HAR files")
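Note that load_pickle_tree keeps its lru_cache decorator after the move, and the capture directory's mtime is part of the cache key. Callers pass the current mtime, so a rewritten capture directory produces a fresh key instead of serving a stale tree. A short usage sketch with placeholder values, mirroring how CaptureCache and index_capture call it:

from pathlib import Path
import logging

from lookyloo.helpers import load_pickle_tree

capture_dir = Path('/path/to/capture')  # placeholder capture directory
logger = logging.getLogger('example')
# The mtime argument doubles as a cache-invalidation token.
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)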

View File

@@ -15,13 +15,15 @@ import mmh3
from bs4 import BeautifulSoup
from hashlib import sha256
from pathlib import Path
from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection
from .exceptions import NoValidHarFile, TreeNeedsRebuild
from .helpers import load_pickle_tree
from .default import get_socket_path, get_config
# from .helpers import get_public_suffix_list
class Indexing():
@@ -53,12 +55,17 @@ class Indexing():
def redis(self) -> Redis: # type: ignore[type-arg]
return Redis(connection_pool=self.__redis_pool)
@property
def can_index(self) -> bool:
def can_index(self, capture_uuid: str | None=None) -> bool:
if capture_uuid:
return bool(self.redis.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))
return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))
def indexing_done(self) -> None:
self.redis.delete('ongoing_indexing')
def indexing_done(self, capture_uuid: str | None=None) -> None:
if capture_uuid:
self.redis.delete(f'ongoing_indexing|{capture_uuid}')
else:
self.redis.delete('ongoing_indexing')
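can_index now doubles as a lock: given a capture UUID it takes a per-capture key, otherwise the global one, in both cases via a Redis SET with NX and an expiry as a safety net, and indexing_done releases the matching key. A sketch of that pattern with redis-py, using a placeholder socket path and UUID:

from redis import Redis

r = Redis(unix_socket_path='indexing.sock', decode_responses=True)  # placeholder socket path
capture_uuid = '00000000-0000-0000-0000-000000000000'               # placeholder UUID

lock_key = f'ongoing_indexing|{capture_uuid}'
# SET ... NX EX 360 succeeds only if nobody holds the lock, and the key expires
# on its own if the process dies before releasing it.
if r.set(lock_key, 1, ex=360, nx=True):
    try:
        pass  # index the capture
    finally:
        r.delete(lock_key)  # what indexing_done(capture_uuid) does
else:
    pass  # another process is already indexing this capture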
def force_reindex(self, capture_uuid: str) -> None:
p = self.redis.pipeline()
@@ -91,6 +98,55 @@ class Indexing():
# This call always returns a tuple of 8 booleans
return tuple(to_return) # type: ignore[return-value]
def index_capture(self, uuid_to_index: str, directory: Path) -> None:
if not self.can_index(uuid_to_index):
self.logger.info(f'Indexing on {uuid_to_index} ongoing, skipping. ')
return
try:
indexed = self.capture_indexed(uuid_to_index)
if all(indexed):
return
if not any((directory / pickle_name).exists()
for pickle_name in ['tree.pickle.gz', 'tree.pickle']):
self.logger.warning(f'No pickle for {uuid_to_index}, skipping. ')
return
# do the indexing
ct = load_pickle_tree(directory, directory.stat().st_mtime, self.logger)
if not indexed[0]:
self.logger.info(f'Indexing urls for {uuid_to_index}')
self.index_url_capture(ct)
if not indexed[1]:
self.logger.info(f'Indexing resources for {uuid_to_index}')
self.index_body_hashes_capture(ct)
if not indexed[2]:
self.logger.info(f'Indexing cookies for {uuid_to_index}')
self.index_cookies_capture(ct)
if not indexed[3]:
self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
self.index_http_headers_hashes_capture(ct)
if not indexed[4]:
self.logger.info(f'Indexing favicons for {uuid_to_index}')
self.index_favicons_capture(uuid_to_index, directory)
if not indexed[5]:
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.index_identifiers_capture(ct)
if not indexed[6]:
self.logger.info(f'Indexing categories for {uuid_to_index}')
self.index_categories_capture(uuid_to_index, directory)
if not indexed[7]:
self.logger.info(f'Indexing hash types for {uuid_to_index}')
self.index_capture_hashes_types(ct)
except (TreeNeedsRebuild, NoValidHarFile) as e:
self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
except Exception as e:
self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
finally:
self.indexing_done(uuid_to_index)
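capture_indexed returns one boolean per index family, in the same order as the checks above. A small sketch to list what is still missing for a capture; the slot names are illustrative, only their order comes from index_capture, and indexing / uuid_to_index stand for an Indexing instance and a capture UUID:

# Illustrative names for the eight slots, in the order index_capture checks them.
SLOTS = ('urls', 'resources', 'cookies', 'http_header_hashes',
         'favicons', 'identifiers', 'categories', 'hash_types')
missing = [name for name, done
           in zip(SLOTS, indexing.capture_indexed(uuid_to_index)) if not done]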
# ###### Cookies ######
@property
@@ -349,18 +405,16 @@ class Indexing():
def favicon_number_captures(self, favicon_sha512: str) -> int:
return self.redis.scard(f'favicons|{favicon_sha512}|captures')
def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
def index_favicons_capture(self, capture_uuid: str, capture_dir: Path) -> None:
if self.redis.sismember('indexed_favicons', capture_uuid):
# Do not reindex
return
self.redis.sadd('indexed_favicons', capture_uuid)
self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
pipeline = self.redis.pipeline()
with ZipFile(favicons, 'r') as myzip:
for name in myzip.namelist():
if not name.endswith('.ico'):
continue
favicon = myzip.read(name)
for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
with favicon_path.open('rb') as f:
favicon = f.read()
if not favicon:
# Empty file, ignore.
continue
@@ -552,11 +606,20 @@ class Indexing():
def categories(self) -> set[str]:
return self.redis.smembers('categories')
def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
def index_categories_capture(self, capture_uuid: str, capture_dir: Path) -> None:
if self.redis.sismember('indexed_categories', capture_uuid):
# do not reindex
return
# Make sure we don't reindex
self.redis.sadd('indexed_categories', capture_uuid)
categ_file = capture_dir / 'categories'
if categ_file.exists():
with categ_file.open('r') as f:
capture_categories = [c.strip() for c in f.readlines()]
else:
return
added_in_existing_categories = set()
pipeline = self.redis.pipeline()
for c in self.categories:
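index_categories_capture now reads the categories straight from the capture directory instead of taking them as an argument. The file format is not shown in this diff; the read above implies one category per line, so producing such a file would look roughly like this (an assumption, not the committed writer):

from pathlib import Path

capture_dir = Path('/path/to/capture')  # placeholder capture directory
categories = ['phishing', 'clone']      # example categories
# Assumed format: one category per line, matching the c.strip() read above.
(capture_dir / 'categories').write_text('\n'.join(categories))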

View File

@@ -1142,6 +1142,14 @@ def send_mail(tree_uuid: str) -> WerkzeugResponse:
return redirect(url_for('tree', tree_uuid=tree_uuid))
@app.route('/tree/<string:tree_uuid>/trigger_indexing', methods=['POST', 'GET'])
def trigger_indexing(tree_uuid: str) -> WerkzeugResponse:
cache = lookyloo.capture_cache(tree_uuid)
if cache and hasattr(cache, 'capture_dir'):
get_indexing(flask_login.current_user).index_capture(tree_uuid, cache.capture_dir)
return redirect(url_for('tree', tree_uuid=tree_uuid))
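The new route accepts GET and POST and redirects back to the tree view, which lets the template change further down expose it as a plain link. A sketch of triggering it from a script, with a hypothetical instance URL and a placeholder UUID:

import requests

instance = 'https://lookyloo.example'               # hypothetical instance URL
tree_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder capture UUID
r = requests.get(f'{instance}/tree/{tree_uuid}/trigger_indexing', allow_redirects=False)
print(r.status_code)  # expect a redirect (302) back to /tree/<uuid>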
@app.route('/tree/<string:tree_uuid>', methods=['GET'])
@app.route('/tree/<string:tree_uuid>/<string:node_uuid>', methods=['GET'])
def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
@@ -1199,6 +1207,11 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
monitoring_collections = []
flash(f'Unable to get existing connections from the monitoring : {e}', 'warning')
# Check if the capture has been indexed yet. Flash a warning if not.
capture_indexed = all(get_indexing(flask_login.current_user).capture_indexed(tree_uuid))
if not capture_indexed:
flash('The capture has not been indexed yet. Some correlations will be missing.', 'warning')
return render_template('tree.html', tree_json=ct.to_json(),
info=cache,
tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@@ -1221,6 +1234,7 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
confirm_message=confirm_message if confirm_message else 'Tick to confirm.',
parent_uuid=cache.parent,
has_redirects=True if cache.redirects else False,
capture_indexed=capture_indexed,
capture_settings=capture_settings.model_dump(exclude_none=True) if capture_settings else {})
except NoValidHarFile:

View File

@@ -407,6 +407,10 @@
<div id="tools-menu" class="dropdown">
<button class="dropbtn">Analytical Tools</button>
<div id="tools-menu-content" class="dropdown-content">
{% if not capture_indexed %}
<a href="{{ url_for('trigger_indexing', tree_uuid=tree_uuid) }}" role="button" class="btn btn-warning"
title="The capture isn't (fully) indexed, index now.">Index capture</a>
{% endif %}
<a href="#modulesModal" data-remote="{{ url_for('trigger_modules', tree_uuid=tree_uuid, force=False) }}"
data-bs-toggle="modal" data-bs-target="#modulesModal" role="button"
title="Lookups from supported 3rd party services">Third Party Reports</a>