new: Force indexing of a specific capture from the UI

This should also make the indexing a lot faster.
pull/937/head
Raphaël Vinot 2024-08-29 13:32:38 +02:00
parent 28e81a1eae
commit 1085932ad2
6 changed files with 172 additions and 133 deletions

View File

@@ -4,14 +4,12 @@ from __future__ import annotations
 import logging
 import logging.config
+from pathlib import Path

 from redis import Redis
-from typing import Generator

-from lookyloo import Lookyloo, Indexing
+from lookyloo import Indexing
-from lookyloo.capturecache import get_pickle_path
 from lookyloo.default import AbstractManager, get_config, get_socket_path
-from lookyloo.exceptions import NoValidHarFile

 logging.config.dictConfig(get_config('logging'))
@@ -21,7 +19,6 @@ class BackgroundIndexer(AbstractManager):
     def __init__(self, full: bool=False, loglevel: int | None=None):
         super().__init__(loglevel)
-        self.lookyloo = Lookyloo(cache_max_size=1)
         self.is_public_instance = get_config('generic', 'public_instance')
         self.full_indexer = full
         self.indexing = Indexing(full_index=self.full_indexer)
@@ -35,66 +32,22 @@ class BackgroundIndexer(AbstractManager):
     def _to_run_forever(self) -> None:
         self._check_indexes()
-        # Don't need the cache in this class.
-        self.lookyloo.clear_tree_cache()

-    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
-        # NOTE: only get the non-archived captures for now.
-        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
-            if not self.full_indexer:
-                # If we're not running the full indexer, check if the capture should be indexed.
-                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
-                    # Capture unindexed
-                    continue
-            if get_pickle_path(directory) is None:
-                # pickle isn't ready, we can't index.
-                continue
-            indexed = self.indexing.capture_indexed(uuid)
-            if all(indexed):
-                continue
-            yield indexed, uuid

     def _check_indexes(self) -> None:
-        if not self.indexing.can_index:
+        if not self.indexing.can_index():
             # There is no reason to run this method in multiple scripts.
             self.logger.info('Indexing already ongoing in another process.')
             return None
         self.logger.info(f'Check {self.script_name}...')
-        for indexed, uuid_to_index in self._to_index_no_cache():
-            try:
-                ct = self.lookyloo.get_crawled_tree(uuid_to_index)
-            except NoValidHarFile:
-                self.logger.warning(f'Broken pickle for {uuid_to_index}')
-                self.lookyloo.remove_pickle(uuid_to_index)
-                continue
-            if not indexed[0]:
-                self.logger.info(f'Indexing urls for {uuid_to_index}')
-                self.indexing.index_url_capture(ct)
-            if not indexed[1]:
-                self.logger.info(f'Indexing resources for {uuid_to_index}')
-                self.indexing.index_body_hashes_capture(ct)
-            if not indexed[2]:
-                self.logger.info(f'Indexing cookies for {uuid_to_index}')
-                self.indexing.index_cookies_capture(ct)
-            if not indexed[3]:
-                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
-                self.indexing.index_http_headers_hashes_capture(ct)
-            if not indexed[4]:
-                self.logger.info(f'Indexing favicons for {uuid_to_index}')
-                favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
-                self.indexing.index_favicons_capture(uuid_to_index, favicons)
-            if not indexed[5]:
-                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
-                self.indexing.index_identifiers_capture(ct)
-            if not indexed[6]:
-                self.logger.info(f'Indexing categories for {uuid_to_index}')
-                categories = self.lookyloo.categories_capture(uuid_to_index)
-                self.indexing.index_categories_capture(uuid_to_index, categories)
-            if not indexed[7]:
-                self.logger.info(f'Indexing hash types for {uuid_to_index}')
-                self.indexing.index_capture_hashes_types(ct)
+        # NOTE: only get the non-archived captures for now.
+        for uuid, d in self.redis.hscan_iter('lookup_dirs'):
+            if not self.full_indexer:
+                # If we're not running the full indexer, check if the capture should be indexed.
+                if self.is_public_instance and self.redis.hexists(d, 'no_index'):
+                    # Capture unindexed
+                    continue
+            self.indexing.index_capture(uuid, Path(d))
         self.indexing.indexing_done()
         self.logger.info('... done.')

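The background indexer loop above now just walks the lookup_dirs hash and hands each capture to Indexing.index_capture(), which takes care of locking, skipping already-indexed captures, and loading the pickle. A minimal sketch of the same pattern as a one-off script (the 'cache' socket name and the decode_responses flag are assumptions about a standard Lookyloo setup; the rest mirrors the diff):

# Hypothetical one-off script mirroring _check_indexes() above,
# assuming the new Indexing.index_capture() API and a local Lookyloo install.
from pathlib import Path

from redis import Redis

from lookyloo import Indexing
from lookyloo.default import get_socket_path

# Assumption: 'cache' is the Redis socket holding the lookup_dirs hash.
redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
indexing = Indexing()
for uuid, directory in redis.hscan_iter('lookup_dirs'):
    # index_capture() grabs a per-capture lock and returns early if the
    # capture is already fully indexed or the pickle is missing.
    indexing.index_capture(uuid, Path(directory))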
View File

@@ -16,8 +16,8 @@ import time
 from collections import OrderedDict
 from collections.abc import Mapping
 from datetime import datetime
-from functools import lru_cache, _CacheInfo as CacheInfo
-from logging import Logger, LoggerAdapter
+from functools import _CacheInfo as CacheInfo
+from logging import LoggerAdapter
 from pathlib import Path
 from typing import Any, MutableMapping, Iterator
@@ -28,7 +28,7 @@ from pyipasnhistory import IPASNHistory  # type: ignore[attr-defined]
 from redis import Redis

 from .context import Context
-from .helpers import get_captures_dir, is_locked
+from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
 from .indexing import Indexing
 from .default import LookylooException, try_make_file, get_config
 from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
@@ -106,63 +106,6 @@ class CaptureCache():
         return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)

-
-def get_pickle_path(capture_dir: Path | str) -> Path | None:
-    if isinstance(capture_dir, str):
-        capture_dir = Path(capture_dir)
-    pickle_file_gz = capture_dir / 'tree.pickle.gz'
-    if pickle_file_gz.exists():
-        return pickle_file_gz
-    pickle_file = capture_dir / 'tree.pickle'
-    if pickle_file.exists():
-        return pickle_file
-    return None
-
-
-def remove_pickle_tree(capture_dir: Path) -> None:
-    pickle_path = get_pickle_path(capture_dir)
-    if pickle_path and pickle_path.exists():
-        pickle_path.unlink()
-
-
-@lru_cache(maxsize=64)
-def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
-    pickle_path = get_pickle_path(capture_dir)
-    tree = None
-    try:
-        if pickle_path:
-            if pickle_path.suffix == '.gz':
-                with gzip.open(pickle_path, 'rb') as _pg:
-                    tree = pickle.load(_pg)
-            else:  # not a GZ pickle
-                with pickle_path.open('rb') as _p:
-                    tree = pickle.load(_p)
-    except pickle.UnpicklingError:
-        remove_pickle_tree(capture_dir)
-    except EOFError:
-        remove_pickle_tree(capture_dir)
-    except Exception:
-        logger.exception('Unexpected exception when unpickling.')
-        remove_pickle_tree(capture_dir)
-    if tree:
-        try:
-            if tree.root_hartree.har.path.exists():
-                return tree
-            else:
-                # The capture was moved.
-                remove_pickle_tree(capture_dir)
-        except Exception as e:
-            logger.warning(f'The pickle is broken, removing: {e}')
-            remove_pickle_tree(capture_dir)
-    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
-        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
-    # The tree doesn't need to be rebuilt if there are no HAR files.
-    raise NoValidHarFile("Couldn't find HAR files")
-

 def serialize_sets(obj: Any) -> Any:
     if isinstance(obj, set):
         return list(obj)

View File

@@ -3,10 +3,12 @@
 from __future__ import annotations

 import configparser
+import gzip
 import hashlib
 import json
 import logging
 import os
+import pickle
 import re
 import time
@@ -14,10 +16,11 @@ from datetime import datetime, timedelta, date
 from functools import lru_cache, cache
 from importlib.metadata import version
 from io import BufferedIOBase
+from logging import Logger
 from pathlib import Path
 from pydantic import field_validator
 from pydantic_core import from_json
-from typing import Any
+from typing import Any, TYPE_CHECKING
 from urllib.parse import urlparse
@@ -31,9 +34,10 @@ from werkzeug.user_agent import UserAgent
 from werkzeug.utils import cached_property

 from .default import get_homedir, safe_create_dir, get_config, LookylooException
-from .indexing import Indexing
-# from .exceptions import InvalidCaptureSetting
+from .exceptions import NoValidHarFile, TreeNeedsRebuild

+if TYPE_CHECKING:
+    from .indexing import Indexing

 logger = logging.getLogger('Lookyloo - Helpers')
@@ -441,8 +445,66 @@ def load_user_config(username: str) -> dict[str, Any] | None:
 @cache
 def get_indexing(full: bool=False) -> Indexing:
+    from .indexing import Indexing
     if not get_config('generic', 'index_everything'):
         return Indexing()
     if full:
         return Indexing(full_index=True)
     return Indexing()
+
+
+def get_pickle_path(capture_dir: Path | str) -> Path | None:
+    if isinstance(capture_dir, str):
+        capture_dir = Path(capture_dir)
+    pickle_file_gz = capture_dir / 'tree.pickle.gz'
+    if pickle_file_gz.exists():
+        return pickle_file_gz
+    pickle_file = capture_dir / 'tree.pickle'
+    if pickle_file.exists():
+        return pickle_file
+    return None
+
+
+def remove_pickle_tree(capture_dir: Path) -> None:
+    pickle_path = get_pickle_path(capture_dir)
+    if pickle_path and pickle_path.exists():
+        pickle_path.unlink()
+
+
+@lru_cache(maxsize=64)
+def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
+    pickle_path = get_pickle_path(capture_dir)
+    tree = None
+    try:
+        if pickle_path:
+            if pickle_path.suffix == '.gz':
+                with gzip.open(pickle_path, 'rb') as _pg:
+                    tree = pickle.load(_pg)
+            else:  # not a GZ pickle
+                with pickle_path.open('rb') as _p:
+                    tree = pickle.load(_p)
+    except pickle.UnpicklingError:
+        remove_pickle_tree(capture_dir)
+    except EOFError:
+        remove_pickle_tree(capture_dir)
+    except Exception:
+        logger.exception('Unexpected exception when unpickling.')
+        remove_pickle_tree(capture_dir)
+    if tree:
+        try:
+            if tree.root_hartree.har.path.exists():
+                return tree
+            else:
+                # The capture was moved.
+                remove_pickle_tree(capture_dir)
+        except Exception as e:
+            logger.warning(f'The pickle is broken, removing: {e}')
+            remove_pickle_tree(capture_dir)
+    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
+        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
+    # The tree doesn't need to be rebuilt if there are no HAR files.
+    raise NoValidHarFile("Couldn't find HAR files")

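Note that load_pickle_tree() keeps its @lru_cache(maxsize=64) decorator after the move: the capture directory's mtime is part of the cache key, so a rebuilt pickle is a cache miss and gets reloaded, while an unchanged one is served from memory. A standalone sketch of that mtime-as-cache-key idiom (the loader here is made up; only the caching pattern matches the code above):

from functools import lru_cache
from pathlib import Path


@lru_cache(maxsize=64)
def _load(path: Path, last_mod_time: float) -> bytes:
    # Expensive load, cached per (path, mtime) pair.
    return path.read_bytes()


def load(path: Path) -> bytes:
    # Passing the current mtime means touching the file invalidates the old cache entry.
    return _load(path, path.stat().st_mtime)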
View File

@@ -15,13 +15,15 @@ import mmh3
 from bs4 import BeautifulSoup
 from hashlib import sha256
+from pathlib import Path

 from har2tree import CrawledTree
 from redis import ConnectionPool, Redis
 from redis.connection import UnixDomainSocketConnection

+from .exceptions import NoValidHarFile, TreeNeedsRebuild
+from .helpers import load_pickle_tree
 from .default import get_socket_path, get_config
-# from .helpers import get_public_suffix_list


 class Indexing():
@@ -53,11 +55,16 @@ class Indexing():
     def redis(self) -> Redis:  # type: ignore[type-arg]
         return Redis(connection_pool=self.__redis_pool)

-    @property
-    def can_index(self) -> bool:
+    def can_index(self, capture_uuid: str | None=None) -> bool:
+        if capture_uuid:
+            return bool(self.redis.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))
         return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))

-    def indexing_done(self) -> None:
-        self.redis.delete('ongoing_indexing')
+    def indexing_done(self, capture_uuid: str | None=None) -> None:
+        if capture_uuid:
+            self.redis.delete(f'ongoing_indexing|{capture_uuid}')
+        else:
+            self.redis.delete('ongoing_indexing')

     def force_reindex(self, capture_uuid: str) -> None:
@@ -91,6 +98,55 @@ class Indexing():
         # This call for sure returns a tuple of 7 booleans
         return tuple(to_return)  # type: ignore[return-value]

+    def index_capture(self, uuid_to_index: str, directory: Path) -> None:
+        if not self.can_index(uuid_to_index):
+            self.logger.info(f'Indexing on {uuid_to_index} ongoing, skipping. ')
+            return
+        try:
+            indexed = self.capture_indexed(uuid_to_index)
+            if all(indexed):
+                return
+            if not any((directory / pickle_name).exists()
+                       for pickle_name in ['tree.pickle.gz', 'tree.pickle']):
+                self.logger.warning(f'No pickle for {uuid_to_index}, skipping. ')
+                return
+            # do the indexing
+            ct = load_pickle_tree(directory, directory.stat().st_mtime, self.logger)
+            if not indexed[0]:
+                self.logger.info(f'Indexing urls for {uuid_to_index}')
+                self.index_url_capture(ct)
+            if not indexed[1]:
+                self.logger.info(f'Indexing resources for {uuid_to_index}')
+                self.index_body_hashes_capture(ct)
+            if not indexed[2]:
+                self.logger.info(f'Indexing cookies for {uuid_to_index}')
+                self.index_cookies_capture(ct)
+            if not indexed[3]:
+                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
+                self.index_http_headers_hashes_capture(ct)
+            if not indexed[4]:
+                self.logger.info(f'Indexing favicons for {uuid_to_index}')
+                self.index_favicons_capture(uuid_to_index, directory)
+            if not indexed[5]:
+                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
+                self.index_identifiers_capture(ct)
+            if not indexed[6]:
+                self.logger.info(f'Indexing categories for {uuid_to_index}')
+                self.index_categories_capture(uuid_to_index, directory)
+            if not indexed[7]:
+                self.logger.info(f'Indexing hash types for {uuid_to_index}')
+                self.index_capture_hashes_types(ct)
+        except (TreeNeedsRebuild, NoValidHarFile) as e:
+            self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
+        except Exception as e:
+            self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
+        finally:
+            self.indexing_done(uuid_to_index)

     # ###### Cookies ######

     @property
@@ -349,18 +405,16 @@ class Indexing():
     def favicon_number_captures(self, favicon_sha512: str) -> int:
         return self.redis.scard(f'favicons|{favicon_sha512}|captures')

-    def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
+    def index_favicons_capture(self, capture_uuid: str, capture_dir: Path) -> None:
         if self.redis.sismember('indexed_favicons', capture_uuid):
             # Do not reindex
             return
         self.redis.sadd('indexed_favicons', capture_uuid)
         self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
         pipeline = self.redis.pipeline()
-        with ZipFile(favicons, 'r') as myzip:
-            for name in myzip.namelist():
-                if not name.endswith('.ico'):
-                    continue
-                favicon = myzip.read(name)
+        for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
+            with favicon_path.open('rb') as f:
+                favicon = f.read()
             if not favicon:
                 # Empty file, ignore.
                 continue
@@ -552,11 +606,20 @@ class Indexing():
     def categories(self) -> set[str]:
         return self.redis.smembers('categories')

-    def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
+    def index_categories_capture(self, capture_uuid: str, capture_dir: Path) -> None:
         if self.redis.sismember('indexed_categories', capture_uuid):
             # do not reindex
             return
+        # Make sure we don't reindex
         self.redis.sadd('indexed_categories', capture_uuid)
+        categ_file = capture_dir / 'categories'
+        if categ_file.exists():
+            with categ_file.open('r') as f:
+                capture_categories = [c.strip() for c in f.readlines()]
+        else:
+            return

         added_in_existing_categories = set()
         pipeline = self.redis.pipeline()
         for c in self.categories:

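The core of the change is the move from one global ongoing_indexing flag to a per-capture lock: can_index(uuid) does a SET NX with a 360 second expiry on ongoing_indexing|<uuid>, and indexing_done(uuid) deletes it, so several captures can be indexed in parallel and a crashed worker only blocks its own capture until the key expires. A minimal sketch of that locking pattern against a bare Redis connection (the connection itself is a placeholder; the key names and TTL mirror the diff):

import redis

r = redis.Redis()  # placeholder connection, adapt to your setup


def can_index(capture_uuid: str) -> bool:
    # NX: only succeeds if nobody else holds the lock; EX: auto-release after 6 minutes.
    return bool(r.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))


def indexing_done(capture_uuid: str) -> None:
    r.delete(f'ongoing_indexing|{capture_uuid}')


if can_index('some-capture-uuid'):
    try:
        ...  # run the per-capture indexing here
    finally:
        indexing_done('some-capture-uuid')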
View File

@@ -1142,6 +1142,14 @@ def send_mail(tree_uuid: str) -> WerkzeugResponse:
     return redirect(url_for('tree', tree_uuid=tree_uuid))

+
+@app.route('/tree/<string:tree_uuid>/trigger_indexing', methods=['POST', 'GET'])
+def trigger_indexing(tree_uuid: str) -> WerkzeugResponse:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if cache and hasattr(cache, 'capture_dir'):
+        get_indexing(flask_login.current_user).index_capture(tree_uuid, cache.capture_dir)
+    return redirect(url_for('tree', tree_uuid=tree_uuid))
+

 @app.route('/tree/<string:tree_uuid>', methods=['GET'])
 @app.route('/tree/<string:tree_uuid>/<string:node_uuid>', methods=['GET'])
 def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
@@ -1199,6 +1207,11 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
             monitoring_collections = []
             flash(f'Unable to get existing connections from the monitoring : {e}', 'warning')

+        # Check if the capture has been indexed yet. Print a warning if not.
+        capture_indexed = all(get_indexing(flask_login.current_user).capture_indexed(tree_uuid))
+        if not capture_indexed:
+            flash('The capture has not been indexed yet. Some correlations will be missing.', 'warning')
+
         return render_template('tree.html', tree_json=ct.to_json(),
                                info=cache,
                                tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@@ -1221,6 +1234,7 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
                                confirm_message=confirm_message if confirm_message else 'Tick to confirm.',
                                parent_uuid=cache.parent,
                                has_redirects=True if cache.redirects else False,
+                               capture_indexed=capture_indexed,
                                capture_settings=capture_settings.model_dump(exclude_none=True) if capture_settings else {})

     except NoValidHarFile:

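With the trigger_indexing route in place, reindexing can also be forced outside the UI by requesting the endpoint directly. A hedged example (base URL, port and whether the route is reachable without authentication depend on the instance; the UUID below is a placeholder):

import requests

base_url = 'http://127.0.0.1:5100'  # adjust to your deployment
tree_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder capture UUID

# GET or POST both work per the route definition above; the response is
# a redirect back to the tree view once indexing has been triggered.
requests.get(f'{base_url}/tree/{tree_uuid}/trigger_indexing', allow_redirects=False)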
View File

@@ -407,6 +407,10 @@
           <div id="tools-menu" class="dropdown">
             <button class="dropbtn">Analytical Tools</button>
             <div id="tools-menu-content" class="dropdown-content">
+              {% if not capture_indexed %}
+              <a href="{{ url_for('trigger_indexing', tree_uuid=tree_uuid) }}" role="button" class="btn btn-warning"
+                 title="The capture isn't (fully) indexed, index now.">Index capture</a>
+              {% endif %}
               <a href="#modulesModal" data-remote="{{ url_for('trigger_modules', tree_uuid=tree_uuid, force=False) }}"
                  data-bs-toggle="modal" data-bs-target="#modulesModal" role="button"
                  title="Lookups from supported 3rd party services">Third Party Reports</a>