new: Force indexing of a specific capture from the UI

This should also make the indexing a lot faster.
pull/937/head
Raphaël Vinot 2024-08-29 13:32:38 +02:00
parent 28e81a1eae
commit 1085932ad2
6 changed files with 172 additions and 133 deletions

View File

@@ -4,14 +4,12 @@ from __future__ import annotations
import logging
import logging.config
from pathlib import Path
from redis import Redis
from typing import Generator
from lookyloo import Lookyloo, Indexing
from lookyloo.capturecache import get_pickle_path
from lookyloo import Indexing
from lookyloo.default import AbstractManager, get_config, get_socket_path
from lookyloo.exceptions import NoValidHarFile
logging.config.dictConfig(get_config('logging'))
@@ -21,7 +19,6 @@ class BackgroundIndexer(AbstractManager):
def __init__(self, full: bool=False, loglevel: int | None=None):
super().__init__(loglevel)
self.lookyloo = Lookyloo(cache_max_size=1)
self.is_public_instance = get_config('generic', 'public_instance')
self.full_indexer = full
self.indexing = Indexing(full_index=self.full_indexer)
@@ -35,66 +32,22 @@ class BackgroundIndexer(AbstractManager):
def _to_run_forever(self) -> None:
self._check_indexes()
# Don't need the cache in this class.
self.lookyloo.clear_tree_cache()
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
# NOTE: only get the non-archived captures for now.
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer:
# If we're not running the full indexer, check if the capture should be indexed.
if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
# Capture unindexed
continue
if get_pickle_path(directory) is None:
# pickle isn't ready, we can't index.
continue
indexed = self.indexing.capture_indexed(uuid)
if all(indexed):
continue
yield indexed, uuid
def _check_indexes(self) -> None:
if not self.indexing.can_index:
if not self.indexing.can_index():
# There is no reason to run this method in multiple scripts.
self.logger.info('Indexing already ongoing in another process.')
return None
self.logger.info(f'Check {self.script_name}...')
for indexed, uuid_to_index in self._to_index_no_cache():
try:
ct = self.lookyloo.get_crawled_tree(uuid_to_index)
except NoValidHarFile:
self.logger.warning(f'Broken pickle for {uuid_to_index}')
self.lookyloo.remove_pickle(uuid_to_index)
continue
# NOTE: only get the non-archived captures for now.
for uuid, d in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer:
# If we're not running the full indexer, check if the capture should be indexed.
if self.is_public_instance and self.redis.hexists(d, 'no_index'):
# Capture unindexed
continue
if not indexed[0]:
self.logger.info(f'Indexing urls for {uuid_to_index}')
self.indexing.index_url_capture(ct)
if not indexed[1]:
self.logger.info(f'Indexing resources for {uuid_to_index}')
self.indexing.index_body_hashes_capture(ct)
if not indexed[2]:
self.logger.info(f'Indexing cookies for {uuid_to_index}')
self.indexing.index_cookies_capture(ct)
if not indexed[3]:
self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
self.indexing.index_http_headers_hashes_capture(ct)
if not indexed[4]:
self.logger.info(f'Indexing favicons for {uuid_to_index}')
favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
self.indexing.index_favicons_capture(uuid_to_index, favicons)
if not indexed[5]:
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.indexing.index_identifiers_capture(ct)
if not indexed[6]:
self.logger.info(f'Indexing categories for {uuid_to_index}')
categories = self.lookyloo.categories_capture(uuid_to_index)
self.indexing.index_categories_capture(uuid_to_index, categories)
if not indexed[7]:
self.logger.info(f'Indexing hash types for {uuid_to_index}')
self.indexing.index_capture_hashes_types(ct)
self.indexing.index_capture(uuid, Path(d))
self.indexing.indexing_done()
self.logger.info('... done.')
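With this change the per-capture work lives in Indexing.index_capture, so the background loop only has to walk lookup_dirs and hand each capture over. A minimal sketch of that flow (not the committed code), assuming get_socket_path('cache') points at the cache Redis socket and a decoded-responses client:

from pathlib import Path
from redis import Redis
from lookyloo import Indexing
from lookyloo.default import get_config, get_socket_path

redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
indexing = Indexing(full_index=False)
is_public_instance = get_config('generic', 'public_instance')

for uuid, directory in redis.hscan_iter('lookup_dirs'):
    if is_public_instance and redis.hexists(directory, 'no_index'):
        continue  # capture explicitly excluded from indexing on a public instance
    # index_capture takes its own per-capture lock and skips already-indexed captures.
    indexing.index_capture(uuid, Path(directory))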

View File

@@ -16,8 +16,8 @@ import time
from collections import OrderedDict
from collections.abc import Mapping
from datetime import datetime
from functools import lru_cache, _CacheInfo as CacheInfo
from logging import Logger, LoggerAdapter
from functools import _CacheInfo as CacheInfo
from logging import LoggerAdapter
from pathlib import Path
from typing import Any, MutableMapping, Iterator
@@ -28,7 +28,7 @@ from pyipasnhistory import IPASNHistory # type: ignore[attr-defined]
from redis import Redis
from .context import Context
from .helpers import get_captures_dir, is_locked
from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
from .indexing import Indexing
from .default import LookylooException, try_make_file, get_config
from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
@@ -106,63 +106,6 @@ class CaptureCache():
return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
def get_pickle_path(capture_dir: Path | str) -> Path | None:
if isinstance(capture_dir, str):
capture_dir = Path(capture_dir)
pickle_file_gz = capture_dir / 'tree.pickle.gz'
if pickle_file_gz.exists():
return pickle_file_gz
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
return pickle_file
return None
def remove_pickle_tree(capture_dir: Path) -> None:
pickle_path = get_pickle_path(capture_dir)
if pickle_path and pickle_path.exists():
pickle_path.unlink()
@lru_cache(maxsize=64)
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
pickle_path = get_pickle_path(capture_dir)
tree = None
try:
if pickle_path:
if pickle_path.suffix == '.gz':
with gzip.open(pickle_path, 'rb') as _pg:
tree = pickle.load(_pg)
else: # not a GZ pickle
with pickle_path.open('rb') as _p:
tree = pickle.load(_p)
except pickle.UnpicklingError:
remove_pickle_tree(capture_dir)
except EOFError:
remove_pickle_tree(capture_dir)
except Exception:
logger.exception('Unexpected exception when unpickling.')
remove_pickle_tree(capture_dir)
if tree:
try:
if tree.root_hartree.har.path.exists():
return tree
else:
# The capture was moved.
remove_pickle_tree(capture_dir)
except Exception as e:
logger.warning(f'The pickle is broken, removing: {e}')
remove_pickle_tree(capture_dir)
if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
# The tree doesn't need to be rebuilt if there are no HAR files.
raise NoValidHarFile("Couldn't find HAR files")
def serialize_sets(obj: Any) -> Any:
if isinstance(obj, set):
return list(obj)

View File

@@ -3,10 +3,12 @@
from __future__ import annotations
import configparser
import gzip
import hashlib
import json
import logging
import os
import pickle
import re
import time
@@ -14,10 +16,11 @@ from datetime import datetime, timedelta, date
from functools import lru_cache, cache
from importlib.metadata import version
from io import BufferedIOBase
from logging import Logger
from pathlib import Path
from pydantic import field_validator
from pydantic_core import from_json
from typing import Any
from typing import Any, TYPE_CHECKING
from urllib.parse import urlparse
@@ -31,9 +34,10 @@ from werkzeug.user_agent import UserAgent
from werkzeug.utils import cached_property
from .default import get_homedir, safe_create_dir, get_config, LookylooException
from .indexing import Indexing
# from .exceptions import InvalidCaptureSetting
from .exceptions import NoValidHarFile, TreeNeedsRebuild
if TYPE_CHECKING:
from .indexing import Indexing
logger = logging.getLogger('Lookyloo - Helpers')
@@ -441,8 +445,66 @@ def load_user_config(username: str) -> dict[str, Any] | None:
@cache
def get_indexing(full: bool=False) -> Indexing:
from .indexing import Indexing
if not get_config('generic', 'index_everything'):
return Indexing()
if full:
return Indexing(full_index=True)
return Indexing()
def get_pickle_path(capture_dir: Path | str) -> Path | None:
if isinstance(capture_dir, str):
capture_dir = Path(capture_dir)
pickle_file_gz = capture_dir / 'tree.pickle.gz'
if pickle_file_gz.exists():
return pickle_file_gz
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
return pickle_file
return None
def remove_pickle_tree(capture_dir: Path) -> None:
pickle_path = get_pickle_path(capture_dir)
if pickle_path and pickle_path.exists():
pickle_path.unlink()
@lru_cache(maxsize=64)
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
pickle_path = get_pickle_path(capture_dir)
tree = None
try:
if pickle_path:
if pickle_path.suffix == '.gz':
with gzip.open(pickle_path, 'rb') as _pg:
tree = pickle.load(_pg)
else: # not a GZ pickle
with pickle_path.open('rb') as _p:
tree = pickle.load(_p)
except pickle.UnpicklingError:
remove_pickle_tree(capture_dir)
except EOFError:
remove_pickle_tree(capture_dir)
except Exception:
logger.exception('Unexpected exception when unpickling.')
remove_pickle_tree(capture_dir)
if tree:
try:
if tree.root_hartree.har.path.exists():
return tree
else:
# The capture was moved.
remove_pickle_tree(capture_dir)
except Exception as e:
logger.warning(f'The pickle is broken, removing: {e}')
remove_pickle_tree(capture_dir)
if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
# The tree doesn't need to be rebuilt if there are no HAR files.
raise NoValidHarFile("Couldn't find HAR files")
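Note that load_pickle_tree keeps its lru_cache decorator after the move, and the capture directory's mtime is part of the cache key. Callers pass the current mtime, so a rewritten capture directory produces a fresh key instead of serving a stale tree. A short usage sketch with placeholder values, mirroring how CaptureCache and index_capture call it:

from pathlib import Path
import logging

from lookyloo.helpers import load_pickle_tree

capture_dir = Path('/path/to/capture')  # placeholder capture directory
logger = logging.getLogger('example')
# The mtime argument doubles as a cache-invalidation token.
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)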

View File

@@ -15,13 +15,15 @@ import mmh3
from bs4 import BeautifulSoup
from hashlib import sha256
from pathlib import Path
from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection
from .exceptions import NoValidHarFile, TreeNeedsRebuild
from .helpers import load_pickle_tree
from .default import get_socket_path, get_config
# from .helpers import get_public_suffix_list
class Indexing():
@@ -53,12 +55,17 @@ class Indexing():
def redis(self) -> Redis: # type: ignore[type-arg]
return Redis(connection_pool=self.__redis_pool)
@property
def can_index(self) -> bool:
def can_index(self, capture_uuid: str | None=None) -> bool:
if capture_uuid:
return bool(self.redis.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))
return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))
def indexing_done(self) -> None:
self.redis.delete('ongoing_indexing')
def indexing_done(self, capture_uuid: str | None=None) -> None:
if capture_uuid:
self.redis.delete(f'ongoing_indexing|{capture_uuid}')
else:
self.redis.delete('ongoing_indexing')
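can_index now doubles as a lock: given a capture UUID it takes a per-capture key, otherwise the global one, in both cases via a Redis SET with NX and an expiry as a safety net, and indexing_done releases the matching key. A sketch of that pattern with redis-py, using a placeholder socket path and UUID:

from redis import Redis

r = Redis(unix_socket_path='indexing.sock', decode_responses=True)  # placeholder socket path
capture_uuid = '00000000-0000-0000-0000-000000000000'               # placeholder UUID

lock_key = f'ongoing_indexing|{capture_uuid}'
# SET ... NX EX 360 succeeds only if nobody holds the lock, and the key expires
# on its own if the process dies before releasing it.
if r.set(lock_key, 1, ex=360, nx=True):
    try:
        pass  # index the capture
    finally:
        r.delete(lock_key)  # what indexing_done(capture_uuid) does
else:
    pass  # another process is already indexing this capture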
def force_reindex(self, capture_uuid: str) -> None:
p = self.redis.pipeline()
@@ -91,6 +98,55 @@ class Indexing():
# This call always returns a tuple of 8 booleans
return tuple(to_return) # type: ignore[return-value]
def index_capture(self, uuid_to_index: str, directory: Path) -> None:
if not self.can_index(uuid_to_index):
self.logger.info(f'Indexing on {uuid_to_index} ongoing, skipping. ')
return
try:
indexed = self.capture_indexed(uuid_to_index)
if all(indexed):
return
if not any((directory / pickle_name).exists()
for pickle_name in ['tree.pickle.gz', 'tree.pickle']):
self.logger.warning(f'No pickle for {uuid_to_index}, skipping. ')
return
# do the indexing
ct = load_pickle_tree(directory, directory.stat().st_mtime, self.logger)
if not indexed[0]:
self.logger.info(f'Indexing urls for {uuid_to_index}')
self.index_url_capture(ct)
if not indexed[1]:
self.logger.info(f'Indexing resources for {uuid_to_index}')
self.index_body_hashes_capture(ct)
if not indexed[2]:
self.logger.info(f'Indexing cookies for {uuid_to_index}')
self.index_cookies_capture(ct)
if not indexed[3]:
self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
self.index_http_headers_hashes_capture(ct)
if not indexed[4]:
self.logger.info(f'Indexing favicons for {uuid_to_index}')
self.index_favicons_capture(uuid_to_index, directory)
if not indexed[5]:
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.index_identifiers_capture(ct)
if not indexed[6]:
self.logger.info(f'Indexing categories for {uuid_to_index}')
self.index_categories_capture(uuid_to_index, directory)
if not indexed[7]:
self.logger.info(f'Indexing hash types for {uuid_to_index}')
self.index_capture_hashes_types(ct)
except (TreeNeedsRebuild, NoValidHarFile) as e:
self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
except Exception as e:
self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
finally:
self.indexing_done(uuid_to_index)
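capture_indexed returns one boolean per index family, in the same order as the checks above. A small sketch to list what is still missing for a capture; the slot names are illustrative, only their order comes from index_capture, and indexing / uuid_to_index stand for an Indexing instance and a capture UUID:

# Illustrative names for the eight slots, in the order index_capture checks them.
SLOTS = ('urls', 'resources', 'cookies', 'http_header_hashes',
         'favicons', 'identifiers', 'categories', 'hash_types')
missing = [name for name, done
           in zip(SLOTS, indexing.capture_indexed(uuid_to_index)) if not done]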
# ###### Cookies ######
@property
@@ -349,18 +405,16 @@ class Indexing():
def favicon_number_captures(self, favicon_sha512: str) -> int:
return self.redis.scard(f'favicons|{favicon_sha512}|captures')
def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
def index_favicons_capture(self, capture_uuid: str, capture_dir: Path) -> None:
if self.redis.sismember('indexed_favicons', capture_uuid):
# Do not reindex
return
self.redis.sadd('indexed_favicons', capture_uuid)
self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
pipeline = self.redis.pipeline()
with ZipFile(favicons, 'r') as myzip:
for name in myzip.namelist():
if not name.endswith('.ico'):
continue
favicon = myzip.read(name)
for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
with favicon_path.open('rb') as f:
favicon = f.read()
if not favicon:
# Empty file, ignore.
continue
@@ -552,11 +606,20 @@ class Indexing():
def categories(self) -> set[str]:
return self.redis.smembers('categories')
def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
def index_categories_capture(self, capture_uuid: str, capture_dir: Path) -> None:
if self.redis.sismember('indexed_categories', capture_uuid):
# do not reindex
return
# Make sure we don't reindex
self.redis.sadd('indexed_categories', capture_uuid)
categ_file = capture_dir / 'categories'
if categ_file.exists():
with categ_file.open('r') as f:
capture_categories = [c.strip() for c in f.readlines()]
else:
return
added_in_existing_categories = set()
pipeline = self.redis.pipeline()
for c in self.categories:
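index_categories_capture now reads the categories straight from the capture directory instead of taking them as an argument. The file format is not shown in this diff; the read above implies one category per line, so producing such a file would look roughly like this (an assumption, not the committed writer):

from pathlib import Path

capture_dir = Path('/path/to/capture')  # placeholder capture directory
categories = ['phishing', 'clone']      # example categories
# Assumed format: one category per line, matching the c.strip() read above.
(capture_dir / 'categories').write_text('\n'.join(categories))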

View File

@@ -1142,6 +1142,14 @@ def send_mail(tree_uuid: str) -> WerkzeugResponse:
return redirect(url_for('tree', tree_uuid=tree_uuid))
@app.route('/tree/<string:tree_uuid>/trigger_indexing', methods=['POST', 'GET'])
def trigger_indexing(tree_uuid: str) -> WerkzeugResponse:
cache = lookyloo.capture_cache(tree_uuid)
if cache and hasattr(cache, 'capture_dir'):
get_indexing(flask_login.current_user).index_capture(tree_uuid, cache.capture_dir)
return redirect(url_for('tree', tree_uuid=tree_uuid))
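The new route accepts GET and POST and redirects back to the tree view, which lets the template change further down expose it as a plain link. A sketch of triggering it from a script, with a hypothetical instance URL and a placeholder UUID:

import requests

instance = 'https://lookyloo.example'               # hypothetical instance URL
tree_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder capture UUID
r = requests.get(f'{instance}/tree/{tree_uuid}/trigger_indexing', allow_redirects=False)
print(r.status_code)  # expect a redirect (302) back to /tree/<uuid>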
@app.route('/tree/<string:tree_uuid>', methods=['GET'])
@app.route('/tree/<string:tree_uuid>/<string:node_uuid>', methods=['GET'])
def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
@@ -1199,6 +1207,11 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
monitoring_collections = []
flash(f'Unable to get existing connections from the monitoring : {e}', 'warning')
# Check if the capture has been indexed yet. Flash a warning if not.
capture_indexed = all(get_indexing(flask_login.current_user).capture_indexed(tree_uuid))
if not capture_indexed:
flash('The capture has not been indexed yet. Some correlations will be missing.', 'warning')
return render_template('tree.html', tree_json=ct.to_json(),
info=cache,
tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@@ -1221,6 +1234,7 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
confirm_message=confirm_message if confirm_message else 'Tick to confirm.',
parent_uuid=cache.parent,
has_redirects=True if cache.redirects else False,
capture_indexed=capture_indexed,
capture_settings=capture_settings.model_dump(exclude_none=True) if capture_settings else {})
except NoValidHarFile:

View File

@@ -407,6 +407,10 @@
<div id="tools-menu" class="dropdown">
<button class="dropbtn">Analytical Tools</button>
<div id="tools-menu-content" class="dropdown-content">
{% if not capture_indexed %}
<a href="{{ url_for('trigger_indexing', tree_uuid=tree_uuid) }}" role="button" class="btn btn-warning"
title="The capture isn't (fully) indexed, index now.">Index capture</a>
{% endif %}
<a href="#modulesModal" data-remote="{{ url_for('trigger_modules', tree_uuid=tree_uuid, force=False) }}"
data-bs-toggle="modal" data-bs-target="#modulesModal" role="button"
title="Lookups from supported 3rd party services">Third Party Reports</a>