mirror of https://github.com/CIRCL/lookyloo
new: Force indexing of a specific capture from the UI
This should also make the indexing a lot faster.

pull/937/head
parent 28e81a1eae
commit 1085932ad2
@@ -4,14 +4,12 @@ from __future__ import annotations
import logging
import logging.config
from pathlib import Path

from redis import Redis
from typing import Generator

from lookyloo import Lookyloo, Indexing
from lookyloo.capturecache import get_pickle_path
from lookyloo import Indexing
from lookyloo.default import AbstractManager, get_config, get_socket_path
from lookyloo.exceptions import NoValidHarFile


logging.config.dictConfig(get_config('logging'))
@@ -21,7 +19,6 @@ class BackgroundIndexer(AbstractManager):

    def __init__(self, full: bool=False, loglevel: int | None=None):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo(cache_max_size=1)
        self.is_public_instance = get_config('generic', 'public_instance')
        self.full_indexer = full
        self.indexing = Indexing(full_index=self.full_indexer)
@@ -35,66 +32,22 @@ class BackgroundIndexer(AbstractManager):

    def _to_run_forever(self) -> None:
        self._check_indexes()
        # Don't need the cache in this class.
        self.lookyloo.clear_tree_cache()

    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
        # NOTE: only get the non-archived captures for now.
        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
            if not self.full_indexer:
                # If we're not running the full indexer, check if the capture should be indexed.
                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
                    # Capture unindexed
                    continue

            if get_pickle_path(directory) is None:
                # pickle isn't ready, we can't index.
                continue
            indexed = self.indexing.capture_indexed(uuid)
            if all(indexed):
                continue
            yield indexed, uuid

    def _check_indexes(self) -> None:
        if not self.indexing.can_index:
        if not self.indexing.can_index():
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Indexing already ongoing in another process.')
            return None
        self.logger.info(f'Check {self.script_name}...')
        for indexed, uuid_to_index in self._to_index_no_cache():
            try:
                ct = self.lookyloo.get_crawled_tree(uuid_to_index)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {uuid_to_index}')
                self.lookyloo.remove_pickle(uuid_to_index)
                continue
        # NOTE: only get the non-archived captures for now.
        for uuid, d in self.redis.hscan_iter('lookup_dirs'):
            if not self.full_indexer:
                # If we're not running the full indexer, check if the capture should be indexed.
                if self.is_public_instance and self.redis.hexists(d, 'no_index'):
                    # Capture unindexed
                    continue

            if not indexed[0]:
                self.logger.info(f'Indexing urls for {uuid_to_index}')
                self.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {uuid_to_index}')
                self.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {uuid_to_index}')
                self.indexing.index_cookies_capture(ct)
            if not indexed[3]:
                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
                self.indexing.index_http_headers_hashes_capture(ct)
            if not indexed[4]:
                self.logger.info(f'Indexing favicons for {uuid_to_index}')
                favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
                self.indexing.index_favicons_capture(uuid_to_index, favicons)
            if not indexed[5]:
                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
                self.indexing.index_identifiers_capture(ct)
            if not indexed[6]:
                self.logger.info(f'Indexing categories for {uuid_to_index}')
                categories = self.lookyloo.categories_capture(uuid_to_index)
                self.indexing.index_categories_capture(uuid_to_index, categories)
            if not indexed[7]:
                self.logger.info(f'Indexing hash types for {uuid_to_index}')
                self.indexing.index_capture_hashes_types(ct)
            self.indexing.index_capture(uuid, Path(d))
        self.indexing.indexing_done()
        self.logger.info('... done.')
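Taken together, the hunk above strips the per-type indexing out of the background indexer and delegates it to the new Indexing.index_capture() introduced further down. A minimal sketch of the resulting loop, reconstructed from the added lines above (not the verbatim final file):

    def _check_indexes(self) -> None:
        if not self.indexing.can_index():
            # Another process already holds the global indexing lock.
            self.logger.info('Indexing already ongoing in another process.')
            return None
        self.logger.info(f'Check {self.script_name}...')
        # NOTE: only get the non-archived captures for now.
        for uuid, d in self.redis.hscan_iter('lookup_dirs'):
            if not self.full_indexer:
                # On a public instance, skip captures flagged as not indexable.
                if self.is_public_instance and self.redis.hexists(d, 'no_index'):
                    continue
            # All the per-type work (URLs, cookies, favicons, ...) now lives here.
            self.indexing.index_capture(uuid, Path(d))
        self.indexing.indexing_done()
        self.logger.info('... done.')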
@@ -16,8 +16,8 @@ import time
from collections import OrderedDict
from collections.abc import Mapping
from datetime import datetime
from functools import lru_cache, _CacheInfo as CacheInfo
from logging import Logger, LoggerAdapter
from functools import _CacheInfo as CacheInfo
from logging import LoggerAdapter
from pathlib import Path
from typing import Any, MutableMapping, Iterator
@@ -28,7 +28,7 @@ from pyipasnhistory import IPASNHistory  # type: ignore[attr-defined]
from redis import Redis

from .context import Context
from .helpers import get_captures_dir, is_locked
from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
from .indexing import Indexing
from .default import LookylooException, try_make_file, get_config
from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
@@ -106,63 +106,6 @@ class CaptureCache():
        return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)


def get_pickle_path(capture_dir: Path | str) -> Path | None:
    if isinstance(capture_dir, str):
        capture_dir = Path(capture_dir)
    pickle_file_gz = capture_dir / 'tree.pickle.gz'
    if pickle_file_gz.exists():
        return pickle_file_gz

    pickle_file = capture_dir / 'tree.pickle'
    if pickle_file.exists():
        return pickle_file

    return None


def remove_pickle_tree(capture_dir: Path) -> None:
    pickle_path = get_pickle_path(capture_dir)
    if pickle_path and pickle_path.exists():
        pickle_path.unlink()


@lru_cache(maxsize=64)
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
    pickle_path = get_pickle_path(capture_dir)
    tree = None
    try:
        if pickle_path:
            if pickle_path.suffix == '.gz':
                with gzip.open(pickle_path, 'rb') as _pg:
                    tree = pickle.load(_pg)
            else:  # not a GZ pickle
                with pickle_path.open('rb') as _p:
                    tree = pickle.load(_p)
    except pickle.UnpicklingError:
        remove_pickle_tree(capture_dir)
    except EOFError:
        remove_pickle_tree(capture_dir)
    except Exception:
        logger.exception('Unexpected exception when unpickling.')
        remove_pickle_tree(capture_dir)

    if tree:
        try:
            if tree.root_hartree.har.path.exists():
                return tree
            else:
                # The capture was moved.
                remove_pickle_tree(capture_dir)
        except Exception as e:
            logger.warning(f'The pickle is broken, removing: {e}')
            remove_pickle_tree(capture_dir)

    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
    # The tree doesn't need to be rebuilt if there are no HAR files.
    raise NoValidHarFile("Couldn't find HAR files")


def serialize_sets(obj: Any) -> Any:
    if isinstance(obj, set):
        return list(obj)
@@ -3,10 +3,12 @@
from __future__ import annotations

import configparser
import gzip
import hashlib
import json
import logging
import os
import pickle
import re
import time
@@ -14,10 +16,11 @@ from datetime import datetime, timedelta, date
from functools import lru_cache, cache
from importlib.metadata import version
from io import BufferedIOBase
from logging import Logger
from pathlib import Path
from pydantic import field_validator
from pydantic_core import from_json
from typing import Any
from typing import Any, TYPE_CHECKING
from urllib.parse import urlparse
@@ -31,9 +34,10 @@ from werkzeug.user_agent import UserAgent
from werkzeug.utils import cached_property

from .default import get_homedir, safe_create_dir, get_config, LookylooException
from .indexing import Indexing
# from .exceptions import InvalidCaptureSetting
from .exceptions import NoValidHarFile, TreeNeedsRebuild

if TYPE_CHECKING:
    from .indexing import Indexing

logger = logging.getLogger('Lookyloo - Helpers')
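Moving `from .indexing import Indexing` under `if TYPE_CHECKING:` keeps the name available for annotations while avoiding a circular import between the helpers and indexing modules at runtime; the runtime import is deferred into `get_indexing()` in the next hunk. A generic illustration of the pattern (module names are only for illustration):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers (mypy, pyright), never at runtime.
    from .indexing import Indexing


def get_indexing() -> Indexing:
    from .indexing import Indexing  # deferred import breaks the import cycle
    return Indexing()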
@@ -441,8 +445,66 @@ def load_user_config(username: str) -> dict[str, Any] | None:

@cache
def get_indexing(full: bool=False) -> Indexing:
    from .indexing import Indexing
    if not get_config('generic', 'index_everything'):
        return Indexing()
    if full:
        return Indexing(full_index=True)
    return Indexing()


def get_pickle_path(capture_dir: Path | str) -> Path | None:
    if isinstance(capture_dir, str):
        capture_dir = Path(capture_dir)
    pickle_file_gz = capture_dir / 'tree.pickle.gz'
    if pickle_file_gz.exists():
        return pickle_file_gz

    pickle_file = capture_dir / 'tree.pickle'
    if pickle_file.exists():
        return pickle_file

    return None


def remove_pickle_tree(capture_dir: Path) -> None:
    pickle_path = get_pickle_path(capture_dir)
    if pickle_path and pickle_path.exists():
        pickle_path.unlink()


@lru_cache(maxsize=64)
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
    pickle_path = get_pickle_path(capture_dir)
    tree = None
    try:
        if pickle_path:
            if pickle_path.suffix == '.gz':
                with gzip.open(pickle_path, 'rb') as _pg:
                    tree = pickle.load(_pg)
            else:  # not a GZ pickle
                with pickle_path.open('rb') as _p:
                    tree = pickle.load(_p)
    except pickle.UnpicklingError:
        remove_pickle_tree(capture_dir)
    except EOFError:
        remove_pickle_tree(capture_dir)
    except Exception:
        logger.exception('Unexpected exception when unpickling.')
        remove_pickle_tree(capture_dir)

    if tree:
        try:
            if tree.root_hartree.har.path.exists():
                return tree
            else:
                # The capture was moved.
                remove_pickle_tree(capture_dir)
        except Exception as e:
            logger.warning(f'The pickle is broken, removing: {e}')
            remove_pickle_tree(capture_dir)

    if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
    # The tree doesn't need to be rebuilt if there are no HAR files.
    raise NoValidHarFile("Couldn't find HAR files")
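Two caching details in the helpers above are worth spelling out: `get_indexing()` is wrapped in `functools.cache`, so callers share one `Indexing` instance per argument value, and `load_pickle_tree()` keys its `lru_cache` on the capture directory's mtime, so a rebuilt pickle (new mtime) bypasses any stale cached tree. A hedged usage sketch, assuming these helpers live in `lookyloo.helpers`; the path is a placeholder and error handling is omitted:

import logging
from pathlib import Path

from lookyloo.helpers import get_indexing, get_pickle_path, load_pickle_tree

logger = logging.getLogger(__name__)

indexing = get_indexing(full=False)  # cached: repeated calls with the same argument return the same instance

capture_dir = Path('/path/to/a/capture')  # placeholder
if get_pickle_path(capture_dir):
    # The mtime is part of the lru_cache key, so a rebuilt pickle
    # (new mtime) naturally invalidates the cached entry.
    tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)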
@@ -15,13 +15,15 @@ import mmh3

from bs4 import BeautifulSoup
from hashlib import sha256
from pathlib import Path

from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection

from .exceptions import NoValidHarFile, TreeNeedsRebuild
from .helpers import load_pickle_tree
from .default import get_socket_path, get_config
# from .helpers import get_public_suffix_list


class Indexing():
@@ -53,12 +55,17 @@ class Indexing():
    def redis(self) -> Redis:  # type: ignore[type-arg]
        return Redis(connection_pool=self.__redis_pool)

    @property
    def can_index(self) -> bool:
    def can_index(self, capture_uuid: str | None=None) -> bool:
        if capture_uuid:
            return bool(self.redis.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))

        return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))

    def indexing_done(self) -> None:
        self.redis.delete('ongoing_indexing')
    def indexing_done(self, capture_uuid: str | None=None) -> None:
        if capture_uuid:
            self.redis.delete(f'ongoing_indexing|{capture_uuid}')
        else:
            self.redis.delete('ongoing_indexing')

    def force_reindex(self, capture_uuid: str) -> None:
        p = self.redis.pipeline()
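`can_index()` changes from a property to a method that doubles as a lock: Redis `SET` with `nx=True` only succeeds when the key is absent, so the first caller to set `ongoing_indexing|<uuid>` wins, and the `ex=` TTL (360 s per capture, 3600 s for a full run) makes the lock expire on its own if the process dies before `indexing_done()` runs. A standalone sketch of the same pattern; key name and TTL are illustrative:

from redis import Redis

redis = Redis()

def try_lock(capture_uuid: str) -> bool:
    # nx=True: set only if the key does not exist; ex=360: auto-expire
    # so a crashed indexer cannot hold the lock forever.
    return bool(redis.set(f'ongoing_indexing|{capture_uuid}', 1, ex=360, nx=True))

def unlock(capture_uuid: str) -> None:
    redis.delete(f'ongoing_indexing|{capture_uuid}')

if try_lock('11111111-2222-3333-4444-555555555555'):  # placeholder UUID
    try:
        pass  # ... index the capture ...
    finally:
        unlock('11111111-2222-3333-4444-555555555555')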
@@ -91,6 +98,55 @@ class Indexing():
        # This call for sure returns a tuple of 7 booleans
        return tuple(to_return)  # type: ignore[return-value]

    def index_capture(self, uuid_to_index: str, directory: Path) -> None:
        if not self.can_index(uuid_to_index):
            self.logger.info(f'Indexing on {uuid_to_index} ongoing, skipping. ')
            return

        try:
            indexed = self.capture_indexed(uuid_to_index)
            if all(indexed):
                return

            if not any((directory / pickle_name).exists()
                       for pickle_name in ['tree.pickle.gz', 'tree.pickle']):
                self.logger.warning(f'No pickle for {uuid_to_index}, skipping. ')
                return

            # do the indexing
            ct = load_pickle_tree(directory, directory.stat().st_mtime, self.logger)
            if not indexed[0]:
                self.logger.info(f'Indexing urls for {uuid_to_index}')
                self.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {uuid_to_index}')
                self.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {uuid_to_index}')
                self.index_cookies_capture(ct)
            if not indexed[3]:
                self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
                self.index_http_headers_hashes_capture(ct)
            if not indexed[4]:
                self.logger.info(f'Indexing favicons for {uuid_to_index}')
                self.index_favicons_capture(uuid_to_index, directory)
            if not indexed[5]:
                self.logger.info(f'Indexing identifiers for {uuid_to_index}')
                self.index_identifiers_capture(ct)
            if not indexed[6]:
                self.logger.info(f'Indexing categories for {uuid_to_index}')
                self.index_categories_capture(uuid_to_index, directory)
            if not indexed[7]:
                self.logger.info(f'Indexing hash types for {uuid_to_index}')
                self.index_capture_hashes_types(ct)

        except (TreeNeedsRebuild, NoValidHarFile) as e:
            self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
        except Exception as e:
            self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
        finally:
            self.indexing_done(uuid_to_index)

    # ###### Cookies ######

    @property
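`index_capture()` walks the same boolean tuple that `capture_indexed()` returns, one slot per index type. The mapping below is read off the `if not indexed[n]:` checks above and is purely illustrative, not an API of the Indexing class:

# Slot in the capture_indexed() tuple -> what index_capture() rebuilds for it.
INDEX_SLOTS = {
    0: 'URLs',
    1: 'resources (body hashes)',
    2: 'cookies',
    3: 'HTTP header hashes',
    4: 'favicons',
    5: 'identifiers',
    6: 'categories',
    7: 'hash types',
}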
@@ -349,18 +405,16 @@ class Indexing():
    def favicon_number_captures(self, favicon_sha512: str) -> int:
        return self.redis.scard(f'favicons|{favicon_sha512}|captures')

    def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
    def index_favicons_capture(self, capture_uuid: str, capture_dir: Path) -> None:
        if self.redis.sismember('indexed_favicons', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_favicons', capture_uuid)
        self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
        pipeline = self.redis.pipeline()
        with ZipFile(favicons, 'r') as myzip:
            for name in myzip.namelist():
                if not name.endswith('.ico'):
                    continue
                favicon = myzip.read(name)
        for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
            with favicon_path.open('rb') as f:
                favicon = f.read()
            if not favicon:
                # Empty file, ignore.
                continue
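`index_favicons_capture()` now takes the capture directory and reads the `*.potential_favicons.ico` files directly, instead of receiving a zip built by the Lookyloo object. A minimal sketch of that enumeration outside the class; the path is a placeholder and the sha512 choice is inferred from the `favicons|{favicon_sha512}|captures` key above:

from hashlib import sha512
from pathlib import Path

capture_dir = Path('/path/to/a/capture')  # placeholder
for favicon_path in sorted(capture_dir.glob('*.potential_favicons.ico')):
    favicon = favicon_path.read_bytes()
    if not favicon:
        continue  # empty file, ignore
    print(favicon_path.name, sha512(favicon).hexdigest())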
@@ -552,11 +606,20 @@ class Indexing():
    def categories(self) -> set[str]:
        return self.redis.smembers('categories')

    def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
    def index_categories_capture(self, capture_uuid: str, capture_dir: Path) -> None:
        if self.redis.sismember('indexed_categories', capture_uuid):
            # do not reindex
            return
        # Make sure we don't reindex
        self.redis.sadd('indexed_categories', capture_uuid)

        categ_file = capture_dir / 'categories'
        if categ_file.exists():
            with categ_file.open('r') as f:
                capture_categories = [c.strip() for c in f.readlines()]
        else:
            return

        added_in_existing_categories = set()
        pipeline = self.redis.pipeline()
        for c in self.categories:
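Categories are likewise read straight from a plain `categories` file in the capture directory, one label per line (the old signature took the list as an argument). A sketch of producing such a file for a test capture; the path and labels are placeholders:

from pathlib import Path

capture_dir = Path('/path/to/a/capture')  # placeholder
# One category per line; index_categories_capture() strips each line.
(capture_dir / 'categories').write_text('phishing\nlegitimate\n')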
@@ -1142,6 +1142,14 @@ def send_mail(tree_uuid: str) -> WerkzeugResponse:
    return redirect(url_for('tree', tree_uuid=tree_uuid))


@app.route('/tree/<string:tree_uuid>/trigger_indexing', methods=['POST', 'GET'])
def trigger_indexing(tree_uuid: str) -> WerkzeugResponse:
    cache = lookyloo.capture_cache(tree_uuid)
    if cache and hasattr(cache, 'capture_dir'):
        get_indexing(flask_login.current_user).index_capture(tree_uuid, cache.capture_dir)
    return redirect(url_for('tree', tree_uuid=tree_uuid))


@app.route('/tree/<string:tree_uuid>', methods=['GET'])
@app.route('/tree/<string:tree_uuid>/<string:node_uuid>', methods=['GET'])
def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
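The new route backs the "Index capture" button added to the template below, but it can also be called directly; it accepts GET or POST and redirects back to the tree view. A hedged example with the `requests` library; hostname and UUID are placeholders, and authentication (if the instance requires it) is not shown:

import requests

base = 'https://lookyloo.example.org'  # placeholder instance
uuid = '11111111-2222-3333-4444-555555555555'  # placeholder capture UUID

# Triggers Indexing.index_capture() for this capture, then redirects to /tree/<uuid>.
requests.get(f'{base}/tree/{uuid}/trigger_indexing', allow_redirects=False)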
@@ -1199,6 +1207,11 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
            monitoring_collections = []
            flash(f'Unable to get existing connections from the monitoring : {e}', 'warning')

        # Check if the capture has been indexed yet. Print a warning if not.
        capture_indexed = all(get_indexing(flask_login.current_user).capture_indexed(tree_uuid))
        if not capture_indexed:
            flash('The capture has not been indexed yet. Some correlations will be missing.', 'warning')

        return render_template('tree.html', tree_json=ct.to_json(),
                               info=cache,
                               tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@@ -1221,6 +1234,7 @@ def tree(tree_uuid: str, node_uuid: str | None=None) -> Response | str | WerkzeugResponse:
                               confirm_message=confirm_message if confirm_message else 'Tick to confirm.',
                               parent_uuid=cache.parent,
                               has_redirects=True if cache.redirects else False,
                               capture_indexed=capture_indexed,
                               capture_settings=capture_settings.model_dump(exclude_none=True) if capture_settings else {})

    except NoValidHarFile:
@@ -407,6 +407,10 @@
<div id="tools-menu" class="dropdown">
  <button class="dropbtn">Analytical Tools</button>
  <div id="tools-menu-content" class="dropdown-content">
    {% if not capture_indexed %}
    <a href="{{ url_for('trigger_indexing', tree_uuid=tree_uuid) }}" role="button" class="btn btn-warning"
       title="The capture isn't (fully) indexed, index now.">Index capture</a>
    {% endif %}
    <a href="#modulesModal" data-remote="{{ url_for('trigger_modules', tree_uuid=tree_uuid, force=False) }}"
       data-bs-toggle="modal" data-bs-target="#modulesModal" role="button"
       title="Lookups from supported 3rd party services">Third Party Reports</a>