From 1085932ad227d2c2cc4a12f00e7c4773665550f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Thu, 29 Aug 2024 13:32:38 +0200
Subject: [PATCH] new: Force indexing of a specific capture from the UI

This should also make the indexing a lot faster.
---
 bin/background_indexer.py       | 69 +++++----------------------
 lookyloo/capturecache.py        | 63 ++-----------------------
 lookyloo/helpers.py             | 68 ++++++++++++++++++++++++--
 lookyloo/indexing.py            | 87 ++++++++++++++++++++++++++++-----
 website/web/__init__.py         | 14 ++++++
 website/web/templates/tree.html |  4 ++
 6 files changed, 172 insertions(+), 133 deletions(-)

diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index 3a706816..8089f31d 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -4,14 +4,12 @@ from __future__ import annotations
 
 import logging
 import logging.config
+from pathlib import Path
 
 from redis import Redis
-from typing import Generator
 
-from lookyloo import Lookyloo, Indexing
-from lookyloo.capturecache import get_pickle_path
+from lookyloo import Indexing
 from lookyloo.default import AbstractManager, get_config, get_socket_path
-from lookyloo.exceptions import NoValidHarFile
 
 logging.config.dictConfig(get_config('logging'))
 
@@ -21,7 +19,6 @@ class BackgroundIndexer(AbstractManager):
 
     def __init__(self, full: bool=False, loglevel: int | None=None):
         super().__init__(loglevel)
-        self.lookyloo = Lookyloo(cache_max_size=1)
         self.is_public_instance = get_config('generic', 'public_instance')
         self.full_indexer = full
         self.indexing = Indexing(full_index=self.full_indexer)
@@ -35,66 +32,22 @@ class BackgroundIndexer(AbstractManager):
 
     def _to_run_forever(self) -> None:
         self._check_indexes()
-        # Don't need the cache in this class.
-        self.lookyloo.clear_tree_cache()
-
-    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
-        # NOTE: only get the non-archived captures for now.
-        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
-            if not self.full_indexer:
-                # If we're not running the full indexer, check if the capture should be indexed.
-                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
-                    # Capture unindexed
-                    continue
-
-            if get_pickle_path(directory) is None:
-                # pickle isn't ready, we can't index.
-                continue
-            indexed = self.indexing.capture_indexed(uuid)
-            if all(indexed):
-                continue
-            yield indexed, uuid
 
     def _check_indexes(self) -> None:
-        if not self.indexing.can_index:
+        if not self.indexing.can_index():
             # There is no reason to run this method in multiple scripts.
             self.logger.info('Indexing already ongoing in another process.')
            return None
 
         self.logger.info(f'Check {self.script_name}...')
-        for indexed, uuid_to_index in self._to_index_no_cache():
-            try:
-                ct = self.lookyloo.get_crawled_tree(uuid_to_index)
-            except NoValidHarFile:
-                self.logger.warning(f'Broken pickle for {uuid_to_index}')
-                self.lookyloo.remove_pickle(uuid_to_index)
-                continue
+        # NOTE: only get the non-archived captures for now.
+        for uuid, d in self.redis.hscan_iter('lookup_dirs'):
+            if not self.full_indexer:
+                # If we're not running the full indexer, check if the capture should be indexed.
+                if self.is_public_instance and self.redis.hexists(d, 'no_index'):
+                    # Capture unindexed
+                    continue
 
-            if not indexed[0]:
-                self.logger.info(f'Indexing urls for {uuid_to_index}')
-                self.indexing.index_url_capture(ct)
-            if not indexed[1]:
-                self.logger.info(f'Indexing resources for {uuid_to_index}')
-                self.indexing.index_body_hashes_capture(ct)
-            if not indexed[2]:
-                self.logger.info(f'Indexing cookies for {uuid_to_index}')
-                self.indexing.index_cookies_capture(ct)
-            if not indexed[3]:
-                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
-                self.indexing.index_http_headers_hashes_capture(ct)
-            if not indexed[4]:
-                self.logger.info(f'Indexing favicons for {uuid_to_index}')
-                favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
-                self.indexing.index_favicons_capture(uuid_to_index, favicons)
-            if not indexed[5]:
-                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
-                self.indexing.index_identifiers_capture(ct)
-            if not indexed[6]:
-                self.logger.info(f'Indexing categories for {uuid_to_index}')
-                categories = self.lookyloo.categories_capture(uuid_to_index)
-                self.indexing.index_categories_capture(uuid_to_index, categories)
-            if not indexed[7]:
-                self.logger.info(f'Indexing hash types for {uuid_to_index}')
-                self.indexing.index_capture_hashes_types(ct)
+            self.indexing.index_capture(uuid, Path(d))
         self.indexing.indexing_done()
         self.logger.info('... done.')
 
diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py
index 7f257f0c..9ff76946 100644
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -16,8 +16,8 @@ import time
 
 from collections import OrderedDict
 from collections.abc import Mapping
 from datetime import datetime
-from functools import lru_cache, _CacheInfo as CacheInfo
-from logging import Logger, LoggerAdapter
+from functools import _CacheInfo as CacheInfo
+from logging import LoggerAdapter
 from pathlib import Path
 from typing import Any, MutableMapping, Iterator
 
@@ -28,7 +28,7 @@ from pyipasnhistory import IPASNHistory  # type: ignore[attr-defined]
 from redis import Redis
 
 from .context import Context
-from .helpers import get_captures_dir, is_locked
+from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
 from .indexing import Indexing
 from .default import LookylooException, try_make_file, get_config
 from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
@@ -106,63 +106,6 @@ class CaptureCache():
         return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
 
 
-def get_pickle_path(capture_dir: Path | str) -> Path | None:
-    if isinstance(capture_dir, str):
-        capture_dir = Path(capture_dir)
-    pickle_file_gz = capture_dir / 'tree.pickle.gz'
-    if pickle_file_gz.exists():
-        return pickle_file_gz
-
-    pickle_file = capture_dir / 'tree.pickle'
-    if pickle_file.exists():
-        return pickle_file
-
-    return None
-
-
-def remove_pickle_tree(capture_dir: Path) -> None:
-    pickle_path = get_pickle_path(capture_dir)
-    if pickle_path and pickle_path.exists():
-        pickle_path.unlink()
-
-
-@lru_cache(maxsize=64)
-def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
-    pickle_path = get_pickle_path(capture_dir)
-    tree = None
-    try:
-        if pickle_path:
-            if pickle_path.suffix == '.gz':
-                with gzip.open(pickle_path, 'rb') as _pg:
-                    tree = pickle.load(_pg)
-            else:  # not a GZ pickle
-                with pickle_path.open('rb') as _p:
-                    tree = pickle.load(_p)
-    except pickle.UnpicklingError:
-        remove_pickle_tree(capture_dir)
-    except EOFError:
-        remove_pickle_tree(capture_dir)
-    except Exception:
-        logger.exception('Unexpected exception when unpickling.')
-        remove_pickle_tree(capture_dir)
-
-    if tree:
-        try:
-            if tree.root_hartree.har.path.exists():
-                return tree
-            else:
-                # The capture was moved.
-                remove_pickle_tree(capture_dir)
-        except Exception as e:
-            logger.warning(f'The pickle is broken, removing: {e}')
-            remove_pickle_tree(capture_dir)
-
-    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
-        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
-    # The tree doesn't need to be rebuilt if there are no HAR files.
-    raise NoValidHarFile("Couldn't find HAR files")
-
-
 def serialize_sets(obj: Any) -> Any:
     if isinstance(obj, set):
         return list(obj)
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index 947c34ed..ed0a5c91 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -3,10 +3,12 @@
 from __future__ import annotations
 
 import configparser
+import gzip
 import hashlib
 import json
 import logging
 import os
+import pickle
 import re
 import time
 
@@ -14,10 +16,11 @@ from datetime import datetime, timedelta, date
 from functools import lru_cache, cache
 from importlib.metadata import version
 from io import BufferedIOBase
+from logging import Logger
 from pathlib import Path
 from pydantic import field_validator
 from pydantic_core import from_json
-from typing import Any
+from typing import Any, TYPE_CHECKING
 from urllib.parse import urlparse
 
@@ -31,9 +34,10 @@ from werkzeug.user_agent import UserAgent
 from werkzeug.utils import cached_property
 
 from .default import get_homedir, safe_create_dir, get_config, LookylooException
-from .indexing import Indexing
-# from .exceptions import InvalidCaptureSetting
+from .exceptions import NoValidHarFile, TreeNeedsRebuild
 
+if TYPE_CHECKING:
+    from .indexing import Indexing
 
 logger = logging.getLogger('Lookyloo - Helpers')
 
@@ -441,8 +445,66 @@ def load_user_config(username: str) -> dict[str, Any] | None:
 
 @cache
 def get_indexing(full: bool=False) -> Indexing:
+    from .indexing import Indexing
     if not get_config('generic', 'index_everything'):
         return Indexing()
     if full:
         return Indexing(full_index=True)
     return Indexing()
+
+
+def get_pickle_path(capture_dir: Path | str) -> Path | None:
+    if isinstance(capture_dir, str):
+        capture_dir = Path(capture_dir)
+    pickle_file_gz = capture_dir / 'tree.pickle.gz'
+    if pickle_file_gz.exists():
+        return pickle_file_gz
+
+    pickle_file = capture_dir / 'tree.pickle'
+    if pickle_file.exists():
+        return pickle_file
+
+    return None
+
+
+def remove_pickle_tree(capture_dir: Path) -> None:
+    pickle_path = get_pickle_path(capture_dir)
+    if pickle_path and pickle_path.exists():
+        pickle_path.unlink()
+
+
+@lru_cache(maxsize=64)
+def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
+    pickle_path = get_pickle_path(capture_dir)
+    tree = None
+    try:
+        if pickle_path:
+            if pickle_path.suffix == '.gz':
+                with gzip.open(pickle_path, 'rb') as _pg:
+                    tree = pickle.load(_pg)
+            else:  # not a GZ pickle
+                with pickle_path.open('rb') as _p:
+                    tree = pickle.load(_p)
+    except pickle.UnpicklingError:
+        remove_pickle_tree(capture_dir)
+    except EOFError:
+        remove_pickle_tree(capture_dir)
+    except Exception:
+        logger.exception('Unexpected exception when unpickling.')
+        remove_pickle_tree(capture_dir)
+
+    if tree:
+        try:
+            if tree.root_hartree.har.path.exists():
+                return tree
+            else:
+                # The capture was moved.
+                remove_pickle_tree(capture_dir)
+        except Exception as e:
+            logger.warning(f'The pickle is broken, removing: {e}')
+            remove_pickle_tree(capture_dir)
+
+    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
+        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
+    # The tree doesn't need to be rebuilt if there are no HAR files.
+    raise NoValidHarFile("Couldn't find HAR files")
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index a81ad0bc..1465a069 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -15,13 +15,15 @@ import mmh3
 
 from bs4 import BeautifulSoup
 from hashlib import sha256
+from pathlib import Path
 
 from har2tree import CrawledTree
 from redis import ConnectionPool, Redis
 from redis.connection import UnixDomainSocketConnection
 
+from .exceptions import NoValidHarFile, TreeNeedsRebuild
+from .helpers import load_pickle_tree
 from .default import get_socket_path, get_config
-# from .helpers import get_public_suffix_list
 
 
 class Indexing():
@@ -53,12 +55,17 @@
     def redis(self) -> Redis:  # type: ignore[type-arg]
         return Redis(connection_pool=self.__redis_pool)
 
-    @property
-    def can_index(self) -> bool:
+    def can_index(self, capture_uuid: str | None=None) -> bool:
+        if capture_uuid:
+            return bool(self.redis.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))
+
         return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))
 
-    def indexing_done(self) -> None:
-        self.redis.delete('ongoing_indexing')
+    def indexing_done(self, capture_uuid: str | None=None) -> None:
+        if capture_uuid:
+            self.redis.delete(f'ongoing_indexing|{capture_uuid}')
+        else:
+            self.redis.delete('ongoing_indexing')
 
     def force_reindex(self, capture_uuid: str) -> None:
         p = self.redis.pipeline()
@@ -91,6 +98,55 @@
         # This call for sure returns a tuple of 7 booleans
         return tuple(to_return)  # type: ignore[return-value]
 
+    def index_capture(self, uuid_to_index: str, directory: Path) -> None:
+        if not self.can_index(uuid_to_index):
+            self.logger.info(f'Indexing on {uuid_to_index} ongoing, skipping.')
+            return
+
+        try:
+            indexed = self.capture_indexed(uuid_to_index)
+            if all(indexed):
+                return
+
+            if not any((directory / pickle_name).exists()
+                       for pickle_name in ['tree.pickle.gz', 'tree.pickle']):
+                self.logger.warning(f'No pickle for {uuid_to_index}, skipping.')
+                return
+
+            # do the indexing
+            ct = load_pickle_tree(directory, directory.stat().st_mtime, self.logger)
+            if not indexed[0]:
+                self.logger.info(f'Indexing urls for {uuid_to_index}')
+                self.index_url_capture(ct)
+            if not indexed[1]:
+                self.logger.info(f'Indexing resources for {uuid_to_index}')
+                self.index_body_hashes_capture(ct)
+            if not indexed[2]:
+                self.logger.info(f'Indexing cookies for {uuid_to_index}')
+                self.index_cookies_capture(ct)
+            if not indexed[3]:
+                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
+                self.index_http_headers_hashes_capture(ct)
+            if not indexed[4]:
+                self.logger.info(f'Indexing favicons for {uuid_to_index}')
+                self.index_favicons_capture(uuid_to_index, directory)
+            if not indexed[5]:
+                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
+                self.index_identifiers_capture(ct)
+            if not indexed[6]:
+                self.logger.info(f'Indexing categories for {uuid_to_index}')
+                self.index_categories_capture(uuid_to_index, directory)
+            if not indexed[7]:
+                self.logger.info(f'Indexing hash types for {uuid_to_index}')
+                self.index_capture_hashes_types(ct)
+
+        except (TreeNeedsRebuild, NoValidHarFile) as e:
+            self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
+        except Exception as e:
+            self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
+        finally:
+            self.indexing_done(uuid_to_index)
+
     # ###### Cookies ######
 
     @property
@@ -349,18 +405,16 @@
     def favicon_number_captures(self, favicon_sha512: str) -> int:
         return self.redis.scard(f'favicons|{favicon_sha512}|captures')
 
-    def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
+    def index_favicons_capture(self, capture_uuid: str, capture_dir: Path) -> None:
         if self.redis.sismember('indexed_favicons', capture_uuid):
             # Do not reindex
             return
         self.redis.sadd('indexed_favicons', capture_uuid)
         self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
         pipeline = self.redis.pipeline()
-        with ZipFile(favicons, 'r') as myzip:
-            for name in myzip.namelist():
-                if not name.endswith('.ico'):
-                    continue
-                favicon = myzip.read(name)
+        for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
+            with favicon_path.open('rb') as f:
+                favicon = f.read()
             if not favicon:
                 # Empty file, ignore.
                 continue
@@ -552,11 +606,20 @@
     def categories(self) -> set[str]:
         return self.redis.smembers('categories')
 
-    def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
+    def index_categories_capture(self, capture_uuid: str, capture_dir: Path) -> None:
         if self.redis.sismember('indexed_categories', capture_uuid):
             # do not reindex
             return
+        # Make sure we don't reindex
         self.redis.sadd('indexed_categories', capture_uuid)
+
+        categ_file = capture_dir / 'categories'
+        if categ_file.exists():
+            with categ_file.open('r') as f:
+                capture_categories = [c.strip() for c in f.readlines()]
+        else:
+            return
+
         added_in_existing_categories = set()
         pipeline = self.redis.pipeline()
         for c in self.categories:
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 188c845a..57ed6174 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -1142,6 +1142,14 @@ def send_mail(tree_uuid: str) -> WerkzeugResponse:
     return redirect(url_for('tree', tree_uuid=tree_uuid))
 
 
+@app.route('/tree/<string:tree_uuid>/trigger_indexing', methods=['POST', 'GET'])
+def trigger_indexing(tree_uuid: str) -> WerkzeugResponse:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if cache and hasattr(cache, 'capture_dir'):
+        get_indexing(flask_login.current_user).index_capture(tree_uuid, cache.capture_dir)
+    return redirect(url_for('tree', tree_uuid=tree_uuid))
+
+
 @app.route('/tree/<string:tree_uuid>', methods=['GET'])
 @app.route('/tree/<string:tree_uuid>/<string:node_uuid>', methods=['GET'])
 def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
@@ -1199,6 +1207,11 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | Werkzeu
                 monitoring_collections = []
                 flash(f'Unable to get existing connections from the monitoring : {e}', 'warning')
 
+        # Check if the capture has been indexed yet. Print a warning if not.
+        capture_indexed = all(get_indexing(flask_login.current_user).capture_indexed(tree_uuid))
+        if not capture_indexed:
+            flash('The capture has not been indexed yet. Some correlations will be missing.', 'warning')
+
         return render_template('tree.html', tree_json=ct.to_json(),
                                info=cache,
                                tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@@ -1221,6 +1234,7 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | Werkzeu
                                confirm_message=confirm_message if confirm_message else 'Tick to confirm.',
                                parent_uuid=cache.parent,
                                has_redirects=True if cache.redirects else False,
+                               capture_indexed=capture_indexed,
                                capture_settings=capture_settings.model_dump(exclude_none=True) if capture_settings else {})
 
     except NoValidHarFile:
diff --git a/website/web/templates/tree.html b/website/web/templates/tree.html
index 1a0a22d7..cbc3db44 100644
--- a/website/web/templates/tree.html
+++ b/website/web/templates/tree.html
@@ -407,6 +407,10 @@
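
A minimal sketch of how the new trigger_indexing route could be exercised over HTTP once the patch is applied. The instance URL and capture UUID below are placeholders, not part of the patch, and the snippet assumes the Python requests package is available:

    import requests

    # Placeholders: point these at your own Lookyloo instance and an existing capture.
    LOOKYLOO_URL = 'http://127.0.0.1:5100'
    CAPTURE_UUID = '00000000-0000-0000-0000-000000000000'

    # The route added above accepts GET and POST; it calls Indexing.index_capture()
    # for that capture (a no-op if it is already fully indexed or locked by another
    # indexing run), then redirects back to the tree view.
    r = requests.post(f'{LOOKYLOO_URL}/tree/{CAPTURE_UUID}/trigger_indexing',
                      allow_redirects=False, timeout=30)
    print(r.status_code)  # expect a 302 redirect back to /tree/<uuid>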