diff --git a/bin/background_indexer.py b/bin/background_indexer.py index ae525a2..2d468b1 100755 --- a/bin/background_indexer.py +++ b/bin/background_indexer.py @@ -5,15 +5,10 @@ from __future__ import annotations import logging import logging.config import os -import shutil -from datetime import datetime, timedelta -from pathlib import Path - -from lookyloo import Lookyloo +from lookyloo import Lookyloo, Indexing from lookyloo.default import AbstractManager, get_config -from lookyloo.exceptions import MissingUUID, NoValidHarFile -from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list +from lookyloo.exceptions import NoValidHarFile logging.config.dictConfig(get_config('logging')) @@ -21,125 +16,39 @@ logging.config.dictConfig(get_config('logging')) class BackgroundIndexer(AbstractManager): - def __init__(self, loglevel: int | None=None): + def __init__(self, full: bool=False, loglevel: int | None=None): super().__init__(loglevel) self.lookyloo = Lookyloo() - self.script_name = 'background_indexer' + self.full_indexer = full + self.indexing = Indexing(full_index=self.full_indexer) + if self.full_indexer: + self.script_name = 'background_full_indexer' + else: + self.script_name = 'background_indexer' # make sure discarded captures dir exists self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures' self.discarded_captures_dir.mkdir(parents=True, exist_ok=True) def _to_run_forever(self) -> None: - all_done = self._build_missing_pickles() - if all_done: - self._check_indexes() - # Disable probabilistic indexing for now, mmh3 isn't a fuzzy hash ago. - # self._check_probabilistic_indexes() + self._check_indexes() self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name) - def _build_missing_pickles(self) -> bool: - self.logger.debug('Build missing pickles...') - # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time - # This value makes sure we break out of the loop and build pickles of the most recent captures - max_captures = 50 - got_new_captures = False - - # Initialize time where we do not want to build the pickles anymore. 
- archive_interval = timedelta(days=get_config('generic', 'archive')) - cut_time = (datetime.now() - archive_interval) - for month_dir in make_dirs_list(self.lookyloo.capture_dir): - __counter_shutdown = 0 - for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True): - __counter_shutdown += 1 - if __counter_shutdown % 10 and self.shutdown_requested(): - self.logger.warning('Shutdown requested, breaking.') - return False - if ((path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists()): - # We already have a pickle file - self.logger.debug(f'{path} has a pickle.') - continue - if not list(path.rglob('*.har.gz')) and not list(path.rglob('*.har')): - # No HAR file - self.logger.debug(f'{path} has no HAR file.') - continue - - if is_locked(path): - # it is really locked - self.logger.debug(f'{path} is locked, pickle generated by another process.') - continue - - with (path / 'uuid').open() as f: - uuid = f.read() - - if not self.lookyloo.redis.hexists('lookup_dirs', uuid): - # The capture with this UUID exists, but it is for some reason missing in lookup_dirs - self.lookyloo.redis.hset('lookup_dirs', uuid, str(path)) - else: - cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type] - if cached_path != path: - # we have a duplicate UUID, it is proably related to some bad copy/paste - if cached_path.exists(): - # Both paths exist, move the one that isn't in lookup_dirs - self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {path}, discarding the latest') - try: - shutil.move(str(path), str(self.discarded_captures_dir / path.name)) - except FileNotFoundError as e: - self.logger.warning(f'Unable to move capture: {e}') - continue - else: - # The path in lookup_dirs for that UUID doesn't exists, just update it. - self.lookyloo.redis.hset('lookup_dirs', uuid, str(path)) - - try: - self.logger.info(f'Build pickle for {uuid}: {path.name}') - self.lookyloo.get_crawled_tree(uuid) - self.lookyloo.trigger_modules(uuid, auto_trigger=True) - self.logger.info(f'Pickle for {uuid} built.') - got_new_captures = True - max_captures -= 1 - except MissingUUID: - self.logger.warning(f'Unable to find {uuid}. That should not happen.') - except NoValidHarFile as e: - self.logger.critical(f'There are no HAR files in the capture {uuid}: {path.name} - {e}') - except FileNotFoundError: - self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.') - except Exception: - self.logger.exception(f'Unable to build pickle for {uuid}: {path.name}') - # The capture is not working, moving it away. - try: - shutil.move(str(path), str(self.discarded_captures_dir / path.name)) - self.lookyloo.redis.hdel('lookup_dirs', uuid) - except FileNotFoundError as e: - self.logger.warning(f'Unable to move capture: {e}') - continue - if max_captures <= 0: - self.logger.info('Too many captures in the backlog, start from the beginning.') - return False - if got_new_captures: - self.logger.info('Finished building all missing pickles.') - # Only return True if we built new pickles. - return True - return False - def _check_indexes(self) -> None: - index_redis = self.lookyloo.indexing.redis - can_index = index_redis.set('ongoing_indexing', 1, ex=3600, nx=True) - if not can_index: + if not self.indexing.can_index: # There is no reason to run this method in multiple scripts. 
self.logger.info('Indexing already ongoing in another process.') return None - self.logger.info('Check indexes...') + self.logger.info(f'Check {self.script_name}...') for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False): - if self.lookyloo.is_public_instance and cache.no_index: - # Capture unindexed + if not self.full_indexer: + # If we're not running the full indexer, check if the capture should be indexed. + if self.lookyloo.is_public_instance and cache.no_index: + # Capture unindexed + continue + if not cache.tree_ready: + # pickle isn't ready, we can't index. continue - p = index_redis.pipeline() - p.sismember('indexed_urls', cache.uuid) - p.sismember('indexed_body_hashes', cache.uuid) - p.sismember('indexed_cookies', cache.uuid) - p.sismember('indexed_hhhashes', cache.uuid) - p.sismember('indexed_favicons', cache.uuid) - indexed = p.execute() + indexed = self.indexing.capture_indexed(cache.uuid) if all(indexed): continue try: @@ -151,50 +60,23 @@ class BackgroundIndexer(AbstractManager): if not indexed[0]: self.logger.info(f'Indexing urls for {cache.uuid}') - self.lookyloo.indexing.index_url_capture(ct) + self.indexing.index_url_capture(ct) if not indexed[1]: self.logger.info(f'Indexing resources for {cache.uuid}') - self.lookyloo.indexing.index_body_hashes_capture(ct) + self.indexing.index_body_hashes_capture(ct) if not indexed[2]: self.logger.info(f'Indexing cookies for {cache.uuid}') - self.lookyloo.indexing.index_cookies_capture(ct) + self.indexing.index_cookies_capture(ct) if not indexed[3]: self.logger.info(f'Indexing HH Hashes for {cache.uuid}') - self.lookyloo.indexing.index_http_headers_hashes_capture(ct) + self.indexing.index_http_headers_hashes_capture(ct) if not indexed[4]: self.logger.info(f'Indexing favicons for {cache.uuid}') favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False) - self.lookyloo.indexing.index_favicons_capture(cache.uuid, favicons) + self.indexing.index_favicons_capture(cache.uuid, favicons) # NOTE: categories aren't taken in account here, should be fixed(?) # see indexing.index_categories_capture(capture_uuid, categories) - index_redis.delete('ongoing_indexing') - self.logger.info('... done.') - - def _check_probabilistic_indexes(self) -> None: - index_redis = self.lookyloo.indexing.redis - can_index = index_redis.set('ongoing_probalistic_indexing', 1, ex=3600, nx=True) - if not can_index: - # There is no reason to run this method in multiple scripts. - self.logger.info('Probalistic indexing already ongoing in another process.') - return None - self.logger.info('Check probabilistic indexes...') - algorithms = ['mmh3-shodan'] - for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False): - if self.lookyloo.is_public_instance and cache.no_index: - # Capture unindexed - continue - p = index_redis.pipeline() - for algorithm in algorithms: - p.sismember(f'indexed_favicons_probabilistic|{algorithm}', cache.uuid) - indexed = p.execute() - if all(indexed): - continue - for i, algorithm in enumerate(algorithms): - if not indexed[i]: - self.logger.info(f'Probabilistic indexing favicons for {cache.uuid} with {algorithm}') - favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False) - self.lookyloo.indexing.index_favicons_probabilistic(cache.uuid, favicons, algorithm) - index_redis.delete('ongoing_probalistic_indexing') + self.indexing.indexing_done() self.logger.info('... 
done.') @@ -203,5 +85,12 @@ def main() -> None: i.run(sleep_in_sec=60) +def main_full_indexer() -> None: + if not get_config('generic', 'index_everything'): + raise Exception('Full indexer is disabled.') + i = BackgroundIndexer(full=True) + i.run(sleep_in_sec=60) + + if __name__ == '__main__': main() diff --git a/bin/run_backend.py b/bin/run_backend.py index 014350f..4349b12 100755 --- a/bin/run_backend.py +++ b/bin/run_backend.py @@ -11,7 +11,7 @@ from subprocess import Popen from redis import Redis from redis.exceptions import ConnectionError -from lookyloo.default import get_homedir, get_socket_path +from lookyloo.default import get_homedir, get_socket_path, get_config def check_running(name: str) -> bool: @@ -55,13 +55,32 @@ def shutdown_indexing(storage_directory: Path | None=None) -> None: print('Redis indexing database shutdown.') +def launch_full_index(storage_directory: Path | None=None) -> None: + if not storage_directory: + storage_directory = get_homedir() + if not check_running('full_index'): + Popen(["./run_kvrocks.sh"], cwd=(storage_directory / 'full_index')) + + +def shutdown_full_index(storage_directory: Path | None=None) -> None: + if not storage_directory: + storage_directory = get_homedir() + r = Redis(unix_socket_path=get_socket_path('full_index')) + r.shutdown(save=True) + print('Kvrocks full indexing database shutdown.') + + def launch_all() -> None: launch_cache() launch_indexing() + if get_config('generic', 'index_everything'): + launch_full_index() def check_all(stop: bool=False) -> None: backends: dict[str, bool] = {'cache': False, 'indexing': False} + if get_config('generic', 'index_everything'): + backends['full_index'] = False while True: for db_name in backends.keys(): try: @@ -85,6 +104,8 @@ def check_all(stop: bool=False) -> None: def stop_all() -> None: shutdown_cache() shutdown_indexing() + if get_config('generic', 'index_everything'): + shutdown_full_index() def main() -> None: diff --git a/bin/start.py b/bin/start.py index 30fadd1..2ec0983 100755 --- a/bin/start.py +++ b/bin/start.py @@ -2,7 +2,7 @@ from subprocess import Popen, run -from lookyloo.default import get_homedir +from lookyloo.default import get_homedir, get_config def main() -> None: @@ -18,9 +18,16 @@ def main() -> None: print('Start asynchronous ingestor...') Popen(['async_capture']) print('done.') + print('Start background capture builder...') + Popen(['background_build_captures']) + print('done.') print('Start background indexer...') Popen(['background_indexer']) print('done.') + if get_config('generic', 'index_everything'): + print('Start background full indexer...') + Popen(['background_full_indexer']) + print('done.') print('Start background processing...') Popen(['processing']) print('done.') diff --git a/config/generic.json.sample b/config/generic.json.sample index 9e33fb7..a2db28d 100644 --- a/config/generic.json.sample +++ b/config/generic.json.sample @@ -79,6 +79,7 @@ "bucket_name": "" } }, + "index_everything": false, "_notes": { "loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels", "only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network", @@ -110,6 +111,7 @@ "archive": "The captures older than this value (in days) will be archived. They're not cached by default in the Lookyloo class.", "max_capture_time": "The very maximal time we allow a capture to keep going. 
Should only be triggered by captures that cause playwright to never quit.", "max_tree_create_time": "The max time the generation of a tree is allowed to take", - "s3fs": "The config to access a S3FS instance with the s3fs python module - it is not integrated properly for now as it requires urllib < 2.0 which is a non-started at this stage." + "s3fs": "The config to access a S3FS instance with the s3fs python module - it is not integrated properly for now as it requires urllib < 2.0 which is a non-started at this stage.", + "index_everything": "If true, index every capture, even if it's not public. This feature requires a dedicated kvrocks instance, and is only accessible when logged-in as admin." } } diff --git a/full_index/kvrocks.conf b/full_index/kvrocks.conf new file mode 100644 index 0000000..43c4428 --- /dev/null +++ b/full_index/kvrocks.conf @@ -0,0 +1,875 @@ +################################ GENERAL ##################################### + +# By default kvrocks listens for connections from localhost interface. +# It is possible to listen to just one or multiple interfaces using +# the "bind" configuration directive, followed by one or more IP addresses. +# +# Examples: +# +# bind 192.168.1.100 10.0.0.1 +# bind 127.0.0.1 ::1 +# bind 0.0.0.0 +# bind 127.0.0.1 + +# Unix socket. +# +# Specify the path for the unix socket that will be used to listen for +# incoming connections. There is no default, so kvrocks will not listen +# on a unix socket when not specified. +# +unixsocket full_index.sock +unixsocketperm 777 + +# Accept connections on the specified port, default is 6666. +# port 6666 + +# Close the connection after a client is idle for N seconds (0 to disable) +timeout 0 + +# The number of worker's threads, increase or decrease would affect the performance. +workers 8 + +# By default, kvrocks does not run as a daemon. Use 'yes' if you need it. +# Note that kvrocks will write a PID file in /var/run/kvrocks.pid when daemonized +daemonize yes + +# Kvrocks implements the cluster solution that is similar to the Redis cluster solution. +# You can get cluster information by CLUSTER NODES|SLOTS|INFO command, it also is +# adapted to redis-cli, redis-benchmark, Redis cluster SDK, and Redis cluster proxy. +# But kvrocks doesn't support communicating with each other, so you must set +# cluster topology by CLUSTER SETNODES|SETNODEID commands, more details: #219. +# +# PLEASE NOTE: +# If you enable cluster, kvrocks will encode key with its slot id calculated by +# CRC16 and modulo 16384, encoding key with its slot id makes it efficient to +# migrate keys based on the slot. So if you enabled at first time, cluster mode must +# not be disabled after restarting, and vice versa. That is to say, data is not +# compatible between standalone mode with cluster mode, you must migrate data +# if you want to change mode, otherwise, kvrocks will make data corrupt. +# +# Default: no + +cluster-enabled no + +# By default, namespaces are stored in the configuration file and won't be replicated +# to replicas. This option allows to change this behavior, so that namespaces are also +# propagated to slaves. Note that: +# 1) it won't replicate the 'masterauth' to prevent breaking master/replica replication +# 2) it will overwrite replica's namespace with master's namespace, so be careful of in-using namespaces +# 3) cannot switch off the namespace replication once it's enabled +# +# Default: no +repl-namespace-enabled no + +# Persist the cluster nodes topology in local file($dir/nodes.conf). 
This configuration +# takes effect only if the cluster mode was enabled. +# +# If yes, it will try to load the cluster topology from the local file when starting, +# and dump the cluster nodes into the file if it was changed. +# +# Default: yes +persist-cluster-nodes-enabled yes + +# Set the max number of connected clients at the same time. By default +# this limit is set to 10000 clients. However, if the server is not +# able to configure the process file limit to allow for the specified limit +# the max number of allowed clients is set to the current file limit +# +# Once the limit is reached the server will close all the new connections sending +# an error 'max number of clients reached'. +# +maxclients 10000 + +# Require clients to issue AUTH before processing any other +# commands. This might be useful in environments in which you do not trust +# others with access to the host running kvrocks. +# +# This should stay commented out for backward compatibility and because most +# people do not need auth (e.g. they run their own servers). +# +# Warning: since kvrocks is pretty fast an outside user can try up to +# 150k passwords per second against a good box. This means that you should +# use a very strong password otherwise it will be very easy to break. +# +# requirepass foobared + +# If the master is password protected (using the "masterauth" configuration +# directive below) it is possible to tell the slave to authenticate before +# starting the replication synchronization process. Otherwise, the master will +# refuse the slave request. +# +# masterauth foobared + +# Master-Salve replication would check db name is matched. if not, the slave should +# refuse to sync the db from master. Don't use the default value, set the db-name to identify +# the cluster. +db-name change.me.db + +# The working directory +# +# The DB will be written inside this directory +# Note that you must specify a directory here, not a file name. +dir ./ + +# You can configure where to store your server logs by the log-dir. +# If you don't specify one, we will use the above `dir` as our default log directory. +# We also can send logs to stdout/stderr is as simple as: +# +log-dir stdout + +# Log level +# Possible values: info, warning, error, fatal +# Default: info +log-level info + +# You can configure log-retention-days to control whether to enable the log cleaner +# and the maximum retention days that the INFO level logs will be kept. +# +# if set to -1, that means to disable the log cleaner. +# if set to 0, all previous INFO level logs will be immediately removed. +# if set to between 0 to INT_MAX, that means it will retent latest N(log-retention-days) day logs. + +# By default the log-retention-days is -1. +log-retention-days -1 + +# When running in daemonize mode, kvrocks writes a PID file in ${CONFIG_DIR}/kvrocks.pid by +# default. You can specify a custom pid file location here. +pidfile kvrocks.pid + +# You can configure a slave instance to accept writes or not. Writing against +# a slave instance may be useful to store some ephemeral data (because data +# written on a slave will be easily deleted after resync with the master) but +# may also cause problems if clients are writing to it because of a +# misconfiguration. +slave-read-only yes + +# The slave priority is an integer number published by Kvrocks in the INFO output. +# It is used by Redis Sentinel in order to select a slave to promote into a +# master if the master is no longer working correctly. 
+# +# A slave with a low priority number is considered better for promotion, so +# for instance if there are three slave with priority 10, 100, 25 Sentinel will +# pick the one with priority 10, that is the lowest. +# +# However a special priority of 0 marks the replica as not able to perform the +# role of master, so a slave with priority of 0 will never be selected by +# Redis Sentinel for promotion. +# +# By default the priority is 100. +slave-priority 100 + +# TCP listen() backlog. +# +# In high requests-per-second environments you need an high backlog in order +# to avoid slow clients connections issues. Note that the Linux kernel +# will silently truncate it to the value of /proc/sys/net/core/somaxconn so +# make sure to raise both the value of somaxconn and tcp_max_syn_backlog +# in order to Get the desired effect. +tcp-backlog 511 + +# If the master is an old version, it may have specified replication threads +# that use 'port + 1' as listening port, but in new versions, we don't use +# extra port to implement replication. In order to allow the new replicas to +# copy old masters, you should indicate that the master uses replication port +# or not. +# If yes, that indicates master uses replication port and replicas will connect +# to 'master's listening port + 1' when synchronization. +# If no, that indicates master doesn't use replication port and replicas will +# connect 'master's listening port' when synchronization. +master-use-repl-port no + +# Currently, master only checks sequence number when replica asks for PSYNC, +# that is not enough since they may have different replication histories even +# the replica asking sequence is in the range of the master current WAL. +# +# We design 'Replication Sequence ID' PSYNC, we add unique replication id for +# every write batch (the operation of each command on the storage engine), so +# the combination of replication id and sequence is unique for write batch. +# The master can identify whether the replica has the same replication history +# by checking replication id and sequence. +# +# By default, it is not enabled since this stricter check may easily lead to +# full synchronization. +use-rsid-psync no + +# Master-Slave replication. Use slaveof to make a kvrocks instance a copy of +# another kvrocks server. A few things to understand ASAP about kvrocks replication. +# +# 1) Kvrocks replication is asynchronous, but you can configure a master to +# stop accepting writes if it appears to be not connected with at least +# a given number of slaves. +# 2) Kvrocks slaves are able to perform a partial resynchronization with the +# master if the replication link is lost for a relatively small amount of +# time. You may want to configure the replication backlog size (see the next +# sections of this file) with a sensible value depending on your needs. +# 3) Replication is automatic and does not need user intervention. After a +# network partition slaves automatically try to reconnect to masters +# and resynchronize with them. +# +# slaveof +# slaveof 127.0.0.1 6379 + +# When a slave loses its connection with the master, or when the replication +# is still in progress, the slave can act in two different ways: +# +# 1) if slave-serve-stale-data is set to 'yes' (the default) the slave will +# still reply to client requests, possibly with out-of-date data, or the +# data set may just be empty if this is the first synchronization. 
+# +# 2) if slave-serve-stale-data is set to 'no' the slave will reply with +# an error "SYNC with master in progress" to all kinds of commands +# but to INFO and SLAVEOF. +# +slave-serve-stale-data yes + +# To guarantee slave's data safe and serve when it is in full synchronization +# state, slave still keep itself data. But this way needs to occupy much disk +# space, so we provide a way to reduce disk occupation, slave will delete itself +# entire database before fetching files from master during full synchronization. +# If you want to enable this way, you can set 'slave-delete-db-before-fullsync' +# to yes, but you must know that database will be lost if master is down during +# full synchronization, unless you have a backup of database. +# +# This option is similar redis replicas RDB diskless load option: +# repl-diskless-load on-empty-db +# +# Default: no +slave-empty-db-before-fullsync no + +# A Kvrocks master is able to list the address and port of the attached +# replicas in different ways. For example the "INFO replication" section +# offers this information, which is used, among other tools, by +# Redis Sentinel in order to discover replica instances. +# Another place where this info is available is in the output of the +# "ROLE" command of a master. +# +# The listed IP address and port normally reported by a replica is +# obtained in the following way: +# +# IP: The address is auto detected by checking the peer address +# of the socket used by the replica to connect with the master. +# +# Port: The port is communicated by the replica during the replication +# handshake, and is normally the port that the replica is using to +# listen for connections. +# +# However when port forwarding or Network Address Translation (NAT) is +# used, the replica may actually be reachable via different IP and port +# pairs. The following two options can be used by a replica in order to +# report to its master a specific set of IP and port, so that both INFO +# and ROLE will report those values. +# +# There is no need to use both the options if you need to override just +# the port or the IP address. +# +# replica-announce-ip 5.5.5.5 +# replica-announce-port 1234 + +# If replicas need full synchronization with master, master need to create +# checkpoint for feeding replicas, and replicas also stage a checkpoint of +# the master. If we also keep the backup, it maybe occupy extra disk space. +# You can enable 'purge-backup-on-fullsync' if disk is not sufficient, but +# that may cause remote backup copy failing. +# +# Default: no +purge-backup-on-fullsync no + +# The maximum allowed rate (in MB/s) that should be used by replication. +# If the rate exceeds max-replication-mb, replication will slow down. +# Default: 0 (i.e. no limit) +max-replication-mb 0 + +# The maximum allowed aggregated write rate of flush and compaction (in MB/s). +# If the rate exceeds max-io-mb, io will slow down. +# 0 is no limit +# Default: 0 +max-io-mb 0 + +# The maximum allowed space (in GB) that should be used by RocksDB. +# If the total size of the SST files exceeds max_allowed_space, writes to RocksDB will fail. +# Please see: https://github.com/facebook/rocksdb/wiki/Managing-Disk-Space-Utilization +# Default: 0 (i.e. no limit) +max-db-size 0 + +# The maximum backup to keep, server cron would run every minutes to check the num of current +# backup, and purge the old backup if exceed the max backup num to keep. If max-backup-to-keep +# is 0, no backup would be kept. But now, we only support 0 or 1. 
+max-backup-to-keep 1 + +# The maximum hours to keep the backup. If max-backup-keep-hours is 0, wouldn't purge any backup. +# default: 1 day +max-backup-keep-hours 24 + +# max-bitmap-to-string-mb use to limit the max size of bitmap to string transformation(MB). +# +# Default: 16 +max-bitmap-to-string-mb 16 + +# Whether to enable SCAN-like cursor compatible with Redis. +# If enabled, the cursor will be unsigned 64-bit integers. +# If disabled, the cursor will be a string. +# Default: no +redis-cursor-compatible no + +# Maximum nesting depth allowed when parsing and serializing +# JSON documents while using JSON commands like JSON.SET. +# Default: 1024 +json-max-nesting-depth 1024 + +# The underlying storage format of JSON data type +# NOTE: This option only affects newly written/updated key-values +# The CBOR format may reduce the storage size and speed up JSON commands +# Available values: json, cbor +# Default: json +json-storage-format json + +################################## TLS ################################### + +# By default, TLS/SSL is disabled, i.e. `tls-port` is set to 0. +# To enable it, `tls-port` can be used to define TLS-listening ports. +# tls-port 0 + +# Configure a X.509 certificate and private key to use for authenticating the +# server to connected clients, masters or cluster peers. +# These files should be PEM formatted. +# +# tls-cert-file kvrocks.crt +# tls-key-file kvrocks.key + +# If the key file is encrypted using a passphrase, it can be included here +# as well. +# +# tls-key-file-pass secret + +# Configure a CA certificate(s) bundle or directory to authenticate TLS/SSL +# clients and peers. Kvrocks requires an explicit configuration of at least one +# of these, and will not implicitly use the system wide configuration. +# +# tls-ca-cert-file ca.crt +# tls-ca-cert-dir /etc/ssl/certs + +# By default, clients on a TLS port are required +# to authenticate using valid client side certificates. +# +# If "no" is specified, client certificates are not required and not accepted. +# If "optional" is specified, client certificates are accepted and must be +# valid if provided, but are not required. +# +# tls-auth-clients no +# tls-auth-clients optional + +# By default, only TLSv1.2 and TLSv1.3 are enabled and it is highly recommended +# that older formally deprecated versions are kept disabled to reduce the attack surface. +# You can explicitly specify TLS versions to support. +# Allowed values are case insensitive and include "TLSv1", "TLSv1.1", "TLSv1.2", +# "TLSv1.3" (OpenSSL >= 1.1.1) or any combination. +# To enable only TLSv1.2 and TLSv1.3, use: +# +# tls-protocols "TLSv1.2 TLSv1.3" + +# Configure allowed ciphers. See the ciphers(1ssl) manpage for more information +# about the syntax of this string. +# +# Note: this configuration applies only to <= TLSv1.2. +# +# tls-ciphers DEFAULT:!MEDIUM + +# Configure allowed TLSv1.3 ciphersuites. See the ciphers(1ssl) manpage for more +# information about the syntax of this string, and specifically for TLSv1.3 +# ciphersuites. +# +# tls-ciphersuites TLS_CHACHA20_POLY1305_SHA256 + +# When choosing a cipher, use the server's preference instead of the client +# preference. By default, the server follows the client's preference. +# +# tls-prefer-server-ciphers yes + +# By default, TLS session caching is enabled to allow faster and less expensive +# reconnections by clients that support it. Use the following directive to disable +# caching. +# +# tls-session-caching no + +# Change the default number of TLS sessions cached. 
A zero value sets the cache +# to unlimited size. The default size is 20480. +# +# tls-session-cache-size 5000 + +# Change the default timeout of cached TLS sessions. The default timeout is 300 +# seconds. +# +# tls-session-cache-timeout 60 + +# By default, a replica does not attempt to establish a TLS connection +# with its master. +# +# Use the following directive to enable TLS on replication links. +# +# tls-replication yes + +################################## SLOW LOG ################################### + +# The Kvrocks Slow Log is a mechanism to log queries that exceeded a specified +# execution time. The execution time does not include the I/O operations +# like talking with the client, sending the reply and so forth, +# but just the time needed to actually execute the command (this is the only +# stage of command execution where the thread is blocked and can not serve +# other requests in the meantime). +# +# You can configure the slow log with two parameters: one tells Kvrocks +# what is the execution time, in microseconds, to exceed in order for the +# command to get logged, and the other parameter is the length of the +# slow log. When a new command is logged the oldest one is removed from the +# queue of logged commands. + +# The following time is expressed in microseconds, so 1000000 is equivalent +# to one second. Note that -1 value disables the slow log, while +# a value of zero forces the logging of every command. +slowlog-log-slower-than 100000 + +# There is no limit to this length. Just be aware that it will consume memory. +# You can reclaim memory used by the slow log with SLOWLOG RESET. +slowlog-max-len 128 + +# If you run kvrocks from upstart or systemd, kvrocks can interact with your +# supervision tree. Options: +# supervised no - no supervision interaction +# supervised upstart - signal upstart by putting kvrocks into SIGSTOP mode +# supervised systemd - signal systemd by writing READY=1 to $NOTIFY_SOCKET +# supervised auto - detect upstart or systemd method based on +# UPSTART_JOB or NOTIFY_SOCKET environment variables +# Note: these supervision methods only signal "process is ready." +# They do not enable continuous liveness pings back to your supervisor. +supervised no + +################################## PERF LOG ################################### + +# The Kvrocks Perf Log is a mechanism to log queries' performance context that +# exceeded a specified execution time. This mechanism uses rocksdb's +# Perf Context and IO Stats Context, Please see: +# https://github.com/facebook/rocksdb/wiki/Perf-Context-and-IO-Stats-Context +# +# This mechanism is enabled when profiling-sample-commands is not empty and +# profiling-sample-ratio greater than 0. +# It is important to note that this mechanism affects performance, but it is +# useful for troubleshooting performance bottlenecks, so it should only be +# enabled when performance problems occur. + +# The name of the commands you want to record. Must be original name of +# commands supported by Kvrocks. Use ',' to separate multiple commands and +# use '*' to record all commands supported by Kvrocks. +# Example: +# - Single command: profiling-sample-commands get +# - Multiple commands: profiling-sample-commands get,mget,hget +# +# Default: empty +# profiling-sample-commands "" + +# Ratio of the samples would be recorded. It is a number between 0 and 100. +# We simply use the rand to determine whether to record the sample or not. +# +# Default: 0 +profiling-sample-ratio 0 + +# There is no limit to this length. 
Just be aware that it will consume memory. +# You can reclaim memory used by the perf log with PERFLOG RESET. +# +# Default: 256 +profiling-sample-record-max-len 256 + +# profiling-sample-record-threshold-ms use to tell the kvrocks when to record. +# +# Default: 100 millisecond +profiling-sample-record-threshold-ms 100 + +################################## CRON ################################### + +# Compact Scheduler, auto compact at schedule time +# time expression format is the same as crontab(currently only support * and int) +# e.g. compact-cron 0 3 * * * 0 4 * * * +# would compact the db at 3am and 4am everyday +# compact-cron 0 3 * * * + +# The hour range that compaction checker would be active +# e.g. compaction-checker-range 0-7 means compaction checker would be worker between +# 0-7am every day. +compaction-checker-range 0-7 + +# When the compaction checker is triggered, the db will periodically pick the SST file +# with the highest "deleted percentage" (i.e. the percentage of deleted keys in the SST +# file) to compact, in order to free disk space. +# However, if a specific SST file was created more than "force-compact-file-age" seconds +# ago, and its percentage of deleted keys is higher than +# "force-compact-file-min-deleted-percentage", it will be forcely compacted as well. + +# Default: 172800 seconds; Range: [60, INT64_MAX]; +# force-compact-file-age 172800 +# Default: 10 %; Range: [1, 100]; +# force-compact-file-min-deleted-percentage 10 + +# Bgsave scheduler, auto bgsave at scheduled time +# time expression format is the same as crontab(currently only support * and int) +# e.g. bgsave-cron 0 3 * * * 0 4 * * * +# would bgsave the db at 3am and 4am every day + +# Command renaming. +# +# It is possible to change the name of dangerous commands in a shared +# environment. For instance, the KEYS command may be renamed into something +# hard to guess so that it will still be available for internal-use tools +# but not available for general clients. +# +# Example: +# +# rename-command KEYS b840fc02d524045429941cc15f59e41cb7be6c52 +# +# It is also possible to completely kill a command by renaming it into +# an empty string: +# +# rename-command KEYS "" + +################################ MIGRATE ##################################### +# If the network bandwidth is completely consumed by the migration task, +# it will affect the availability of kvrocks. To avoid this situation, +# migrate-speed is adopted to limit the migrating speed. +# Migrating speed is limited by controlling the duration between sending data, +# the duration is calculated by: 1000000 * migrate-pipeline-size / migrate-speed (us). +# Value: [0,INT_MAX], 0 means no limit +# +# Default: 4096 +migrate-speed 4096 + +# In order to reduce data transmission times and improve the efficiency of data migration, +# pipeline is adopted to send multiple data at once. Pipeline size can be set by this option. +# Value: [1, INT_MAX], it can't be 0 +# +# Default: 16 +migrate-pipeline-size 16 + +# In order to reduce the write forbidden time during migrating slot, we will migrate the incremental +# data several times to reduce the amount of incremental data. Until the quantity of incremental +# data is reduced to a certain threshold, slot will be forbidden write. The threshold is set by +# this option. +# Value: [1, INT_MAX], it can't be 0 +# +# Default: 10000 +migrate-sequence-gap 10000 + +################################ ROCKSDB ##################################### + +# Specify the capacity of column family block cache. 
A larger block cache +# may make requests faster while more keys would be cached. Max Size is 400*1024. +# Default: 4096MB +rocksdb.block_cache_size 4096 + +# A global cache for table-level rows in RocksDB. If almost always point +# lookups, enlarging row cache may improve read performance. Otherwise, +# if we enlarge this value, we can lessen metadata/subkey block cache size. +# +# Default: 0 (disabled) +rocksdb.row_cache_size 0 + +# Number of open files that can be used by the DB. You may need to +# increase this if your database has a large working set. Value -1 means +# files opened are always kept open. You can estimate number of files based +# on target_file_size_base and target_file_size_multiplier for level-based +# compaction. For universal-style compaction, you can usually set it to -1. +# Default: 8096 +rocksdb.max_open_files 8096 + +# Amount of data to build up in memory (backed by an unsorted log +# on disk) before converting to a sorted on-disk file. +# +# Larger values increase performance, especially during bulk loads. +# Up to max_write_buffer_number write buffers may be held in memory +# at the same time, +# so you may wish to adjust this parameter to control memory usage. +# Also, a larger write buffer will result in a longer recovery time +# the next time the database is opened. +# +# Note that write_buffer_size is enforced per column family. +# See db_write_buffer_size for sharing memory across column families. + +# default is 64MB +rocksdb.write_buffer_size 64 + +# Target file size for compaction, target file size for Level N can be calculated +# by target_file_size_base * (target_file_size_multiplier ^ (L-1)) +# +# Default: 128MB +rocksdb.target_file_size_base 128 + +# The maximum number of write buffers that are built up in memory. +# The default and the minimum number is 2, so that when 1 write buffer +# is being flushed to storage, new writes can continue to the other +# write buffer. +# If max_write_buffer_number > 3, writing will be slowed down to +# options.delayed_write_rate if we are writing to the last write buffer +# allowed. +rocksdb.max_write_buffer_number 4 + +# Maximum number of concurrent background jobs (compactions and flushes). +# For backwards compatibility we will set `max_background_jobs = +# max_background_compactions + max_background_flushes` in the case where user +# sets at least one of `max_background_compactions` or `max_background_flushes` +# (we replace -1 by 1 in case one option is unset). +rocksdb.max_background_jobs 4 + +# DEPRECATED: it is automatically decided based on the value of rocksdb.max_background_jobs +# Maximum number of concurrent background compaction jobs, submitted to +# the default LOW priority thread pool. +rocksdb.max_background_compactions -1 + +# DEPRECATED: it is automatically decided based on the value of rocksdb.max_background_jobs +# Maximum number of concurrent background memtable flush jobs, submitted by +# default to the HIGH priority thread pool. If the HIGH priority thread pool +# is configured to have zero threads, flush jobs will share the LOW priority +# thread pool with compaction jobs. +rocksdb.max_background_flushes -1 + +# This value represents the maximum number of threads that will +# concurrently perform a compaction job by breaking it into multiple, +# smaller ones that are run simultaneously. +# Default: 2 +rocksdb.max_sub_compactions 2 + +# In order to limit the size of WALs, RocksDB uses DBOptions::max_total_wal_size +# as the trigger of column family flush. 
Once WALs exceed this size, RocksDB +# will start forcing the flush of column families to allow deletion of some +# oldest WALs. This config can be useful when column families are updated at +# non-uniform frequencies. If there's no size limit, users may need to keep +# really old WALs when the infrequently-updated column families hasn't flushed +# for a while. +# +# In kvrocks, we use multiple column families to store metadata, subkeys, etc. +# If users always use string type, but use list, hash and other complex data types +# infrequently, there will be a lot of old WALs if we don't set size limit +# (0 by default in rocksdb), because rocksdb will dynamically choose the WAL size +# limit to be [sum of all write_buffer_size * max_write_buffer_number] * 4 if set to 0. +# +# Moreover, you should increase this value if you already set rocksdb.write_buffer_size +# to a big value, to avoid influencing the effect of rocksdb.write_buffer_size and +# rocksdb.max_write_buffer_number. +# +# default is 512MB +rocksdb.max_total_wal_size 512 + +# We implement the replication with rocksdb WAL, it would trigger full sync when the seq was out of range. +# wal_ttl_seconds and wal_size_limit_mb would affect how archived logs will be deleted. +# If WAL_ttl_seconds is not 0, then WAL files will be checked every WAL_ttl_seconds / 2 and those that +# are older than WAL_ttl_seconds will be deleted# +# +# Default: 3 Hours +rocksdb.wal_ttl_seconds 10800 + +# If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, +# WAL files will be checked every 10 min and if total size is greater +# then WAL_size_limit_MB, they will be deleted starting with the +# earliest until size_limit is met. All empty files will be deleted +# Default: 16GB +rocksdb.wal_size_limit_mb 16384 + +# Approximate size of user data packed per block. Note that the +# block size specified here corresponds to uncompressed data. The +# actual size of the unit read from disk may be smaller if +# compression is enabled. +# +# Default: 16KB +rocksdb.block_size 16384 + +# Indicating if we'd put index/filter blocks to the block cache +# +# Default: yes +rocksdb.cache_index_and_filter_blocks yes + +# Specify the compression to use. Only compress level greater +# than 2 to improve performance. +# Accept value: "no", "snappy", "lz4", "zstd", "zlib" +# default snappy +rocksdb.compression snappy + +# If non-zero, we perform bigger reads when doing compaction. If you're +# running RocksDB on spinning disks, you should set this to at least 2MB. +# That way RocksDB's compaction is doing sequential instead of random reads. +# When non-zero, we also force new_table_reader_for_compaction_inputs to +# true. +# +# Default: 2 MB +rocksdb.compaction_readahead_size 2097152 + +# he limited write rate to DB if soft_pending_compaction_bytes_limit or +# level0_slowdown_writes_trigger is triggered. + +# If the value is 0, we will infer a value from `rater_limiter` value +# if it is not empty, or 16MB if `rater_limiter` is empty. Note that +# if users change the rate in `rate_limiter` after DB is opened, +# `delayed_write_rate` won't be adjusted. +# +rocksdb.delayed_write_rate 0 +# If enable_pipelined_write is true, separate write thread queue is +# maintained for WAL write and memtable write. +# +# Default: no +rocksdb.enable_pipelined_write no + +# Soft limit on number of level-0 files. We start slowing down writes at this +# point. A value <0 means that no writing slow down will be triggered by +# number of files in level-0. 
+# +# Default: 20 +rocksdb.level0_slowdown_writes_trigger 20 + +# Maximum number of level-0 files. We stop writes at this point. +# +# Default: 40 +rocksdb.level0_stop_writes_trigger 40 + +# Number of files to trigger level-0 compaction. +# +# Default: 4 +rocksdb.level0_file_num_compaction_trigger 4 + +# if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec +# +# Default: 0 +rocksdb.stats_dump_period_sec 0 + +# if yes, the auto compaction would be disabled, but the manual compaction remain works +# +# Default: no +rocksdb.disable_auto_compactions no + +# BlobDB(key-value separation) is essentially RocksDB for large-value use cases. +# Since 6.18.0, The new implementation is integrated into the RocksDB core. +# When set, large values (blobs) are written to separate blob files, and only +# pointers to them are stored in SST files. This can reduce write amplification +# for large-value use cases at the cost of introducing a level of indirection +# for reads. Please see: https://github.com/facebook/rocksdb/wiki/BlobDB. +# +# Note that when enable_blob_files is set to yes, BlobDB-related configuration +# items will take effect. +# +# Default: no +rocksdb.enable_blob_files no + +# The size of the smallest value to be stored separately in a blob file. Values +# which have an uncompressed size smaller than this threshold are stored alongside +# the keys in SST files in the usual fashion. +# +# Default: 4096 byte, 0 means that all values are stored in blob files +rocksdb.min_blob_size 4096 + +# The size limit for blob files. When writing blob files, a new file is +# opened once this limit is reached. +# +# Default: 268435456 bytes +rocksdb.blob_file_size 268435456 + +# Enables garbage collection of blobs. Valid blobs residing in blob files +# older than a cutoff get relocated to new files as they are encountered +# during compaction, which makes it possible to clean up blob files once +# they contain nothing but obsolete/garbage blobs. +# See also rocksdb.blob_garbage_collection_age_cutoff below. +# +# Default: yes +rocksdb.enable_blob_garbage_collection yes + +# The percentage cutoff in terms of blob file age for garbage collection. +# Blobs in the oldest N blob files will be relocated when encountered during +# compaction, where N = (garbage_collection_cutoff/100) * number_of_blob_files. +# Note that this value must belong to [0, 100]. +# +# Default: 25 +rocksdb.blob_garbage_collection_age_cutoff 25 + + +# The purpose of the following three options are to dynamically adjust the upper limit of +# the data that each layer can store according to the size of the different +# layers of the LSM. Enabling this option will bring some improvements in +# deletion efficiency and space amplification, but it will lose a certain +# amount of read performance. +# If you want to know more details about Levels' Target Size, you can read RocksDB wiki: +# https://github.com/facebook/rocksdb/wiki/Leveled-Compaction#levels-target-size +# +# Default: yes +rocksdb.level_compaction_dynamic_level_bytes yes + +# The total file size of level-1 sst. +# +# Default: 268435456 bytes +rocksdb.max_bytes_for_level_base 268435456 + +# Multiplication factor for the total file size of L(n+1) layers. +# This option is a double type number in RocksDB, but kvrocks is +# not support the double data type number yet, so we use integer +# number instead of double currently. +# +# Default: 10 +rocksdb.max_bytes_for_level_multiplier 10 + +# This feature only takes effect in Iterators and MultiGet. 
+# If yes, RocksDB will try to read asynchronously and in parallel as much as possible to hide IO latency. +# In iterators, it will prefetch data asynchronously in the background for each file being iterated on. +# In MultiGet, it will read the necessary data blocks from those files in parallel as much as possible. + +# Default no +rocksdb.read_options.async_io no + +# If yes, the write will be flushed from the operating system +# buffer cache before the write is considered complete. +# If this flag is enabled, writes will be slower. +# If this flag is disabled, and the machine crashes, some recent +# rites may be lost. Note that if it is just the process that +# crashes (i.e., the machine does not reboot), no writes will be +# lost even if sync==false. +# +# Default: no +rocksdb.write_options.sync no + +# If yes, writes will not first go to the write ahead log, +# and the write may get lost after a crash. +# +# Default: no +rocksdb.write_options.disable_wal no + +# If enabled and we need to wait or sleep for the write request, fails +# immediately. +# +# Default: no +rocksdb.write_options.no_slowdown no + +# If enabled, write requests are of lower priority if compaction is +# behind. In this case, no_slowdown = true, the request will be canceled +# immediately. Otherwise, it will be slowed down. +# The slowdown value is determined by RocksDB to guarantee +# it introduces minimum impacts to high priority writes. +# +# Default: no +rocksdb.write_options.low_pri no + +# If enabled, this writebatch will maintain the last insert positions of each +# memtable as hints in concurrent write. It can improve write performance +# in concurrent writes if keys in one writebatch are sequential. +# +# Default: no +rocksdb.write_options.memtable_insert_hint_per_batch no + + +# Support RocksDB auto-tune rate limiter for the background IO +# if enabled, Rate limiter will limit the compaction write if flush write is high +# Please see https://rocksdb.org/blog/2017/12/18/17-auto-tuned-rate-limiter.html +# +# Default: yes +rocksdb.rate_limiter_auto_tuned yes + +# Enable this option will schedule the deletion of obsolete files in a background thread +# on iterator destruction. It can reduce the latency if there are many files to be removed. 
+# see https://github.com/facebook/rocksdb/wiki/IO#avoid-blocking-io +# +# Default: yes +# rocksdb.avoid_unnecessary_blocking_io yes + +################################ NAMESPACE ##################################### +# namespace.test change.me +backup-dir .//backup diff --git a/full_index/run_kvrocks.sh b/full_index/run_kvrocks.sh new file mode 100755 index 0000000..0475498 --- /dev/null +++ b/full_index/run_kvrocks.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -e +set -x + +../../kvrocks/build/kvrocks -c kvrocks.conf diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py index ff88994..60cbcfc 100644 --- a/lookyloo/capturecache.py +++ b/lookyloo/capturecache.py @@ -93,6 +93,10 @@ class CaptureCache(): self.user_agent: str | None = cache_entry.get('user_agent') self.referer: str | None = cache_entry.get('referer') + @property + def tree_ready(self) -> bool: + return bool(_pickle_path(self.capture_dir)) + @property def tree(self) -> CrawledTree: if not self.capture_dir.exists(): @@ -102,27 +106,36 @@ class CaptureCache(): return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger) -def remove_pickle_tree(capture_dir: Path) -> None: - pickle_file = capture_dir / 'tree.pickle' +def _pickle_path(capture_dir: Path) -> Path | None: pickle_file_gz = capture_dir / 'tree.pickle.gz' - if pickle_file.exists(): - pickle_file.unlink() if pickle_file_gz.exists(): - pickle_file_gz.unlink() + return pickle_file_gz + + pickle_file = capture_dir / 'tree.pickle' + if pickle_file.exists(): + return pickle_file + + return None + + +def remove_pickle_tree(capture_dir: Path) -> None: + pickle_path = _pickle_path(capture_dir) + if pickle_path and pickle_path.exists(): + pickle_path.unlink() @lru_cache(maxsize=64) def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree: - pickle_file = capture_dir / 'tree.pickle' - pickle_file_gz = capture_dir / 'tree.pickle.gz' + pickle_path = _pickle_path(capture_dir) tree = None try: - if pickle_file.exists(): - with pickle_file.open('rb') as _p: - tree = pickle.load(_p) - elif pickle_file_gz.exists(): - with gzip.open(pickle_file_gz, 'rb') as _pg: - tree = pickle.load(_pg) + if pickle_path: + if pickle_path.suffix == '.gz': + with gzip.open(pickle_path, 'rb') as _pg: + tree = pickle.load(_pg) + else: # not a GZ pickle + with pickle_path.open('rb') as _p: + tree = pickle.load(_p) except pickle.UnpicklingError: remove_pickle_tree(capture_dir) except EOFError: diff --git a/lookyloo/default/helpers.py b/lookyloo/default/helpers.py index 64ca095..4631a8f 100644 --- a/lookyloo/default/helpers.py +++ b/lookyloo/default/helpers.py @@ -95,8 +95,10 @@ def safe_create_dir(to_create: Path) -> None: def get_socket_path(name: str) -> str: mapping = { 'cache': Path('cache', 'cache.sock'), - 'indexing': Path('indexing', 'indexing.sock'), + 'indexing': Path('indexing', 'indexing.sock') } + if get_config('generic', 'index_everything'): + mapping['full_index'] = Path('full_index', 'full_index.sock') return str(get_homedir() / mapping[name]) diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py index 917a3d8..de05c77 100644 --- a/lookyloo/indexing.py +++ b/lookyloo/indexing.py @@ -24,24 +24,49 @@ from .default import get_socket_path, get_config class Indexing(): - def __init__(self) -> None: + def __init__(self, full_index: bool=False) -> None: self.logger = logging.getLogger(f'{self.__class__.__name__}') self.logger.setLevel(get_config('generic', 'loglevel')) - self.redis_pool_bytes: ConnectionPool = 
ConnectionPool(connection_class=UnixDomainSocketConnection, - path=get_socket_path('indexing')) - self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection, - path=get_socket_path('indexing'), decode_responses=True) + self.__redis_pool_bytes: ConnectionPool + self.__redis_pool: ConnectionPool + if full_index: + self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection, + path=get_socket_path('full_index')) + self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection, + path=get_socket_path('full_index'), decode_responses=True) + else: + self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection, + path=get_socket_path('indexing')) + self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection, + path=get_socket_path('indexing'), decode_responses=True) def clear_indexes(self) -> None: self.redis.flushdb() @property def redis_bytes(self) -> Redis: # type: ignore[type-arg] - return Redis(connection_pool=self.redis_pool_bytes) + return Redis(connection_pool=self.__redis_pool_bytes) @property def redis(self) -> Redis: # type: ignore[type-arg] - return Redis(connection_pool=self.redis_pool) + return Redis(connection_pool=self.__redis_pool) + + @property + def can_index(self) -> bool: + return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True)) + + def indexing_done(self) -> None: + self.redis.delete('ongoing_indexing') + + def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool]: + p = self.redis.pipeline() + p.sismember('indexed_urls', capture_uuid) + p.sismember('indexed_body_hashes', capture_uuid) + p.sismember('indexed_cookies', capture_uuid) + p.sismember('indexed_hhhashes', capture_uuid) + p.sismember('indexed_favicons', capture_uuid) + # This call for sure returns a tuple of 5 booleans + return p.execute() # type: ignore[return-value] def new_internal_uuids(self, crawled_tree: CrawledTree) -> None: # only trigger this method if the capture was already indexed. 
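# --- Editor's note: illustrative sketch, not part of the patch ----------------
# The Indexing class in the hunk above gains a small locking API: `can_index`
# does an atomic SET NX with a one-hour expiry on 'ongoing_indexing',
# `indexing_done` releases it, and `capture_indexed` pipelines five SISMEMBER
# checks and returns five booleans. A minimal consumer could look like the
# snippet below; it assumes a configured Lookyloo install, '<capture-uuid>' is a
# hypothetical placeholder, and the try/finally is an extra safety net that the
# BackgroundIndexer above does not use (it simply calls indexing_done() once it
# has walked the capture cache).
from lookyloo import Indexing

indexing = Indexing(full_index=False)  # full_index=True targets the kvrocks full index

if indexing.can_index:  # lock acquired: no other process is indexing right now
    try:
        urls, bodies, cookies, hhhashes, favicons = indexing.capture_indexed('<capture-uuid>')
        # re-index whatever is still missing for that capture ...
    finally:
        indexing.indexing_done()  # release the lock for the next run
else:
    print('Indexing already ongoing in another process.')
# --- end editor's note ---------------------------------------------------------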
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 15471e9..f4c3513 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -56,7 +56,6 @@ from .helpers import (get_captures_dir, get_email_template, get_resources_hashes, get_taxonomies, uniq_domains, ParsedUserAgent, load_cookies, UserAgents, get_useragent_for_requests, make_ts_from_dirname) -from .indexing import Indexing from .modules import (MISPs, PhishingInitiative, UniversalWhois, UrlScan, VirusTotal, Phishtank, Hashlookup, RiskIQ, RiskIQError, Pandora, URLhaus, CIRCLPDNS) @@ -81,7 +80,6 @@ class Lookyloo(): def __init__(self) -> None: self.logger = logging.getLogger(f'{self.__class__.__name__}') self.logger.setLevel(get_config('generic', 'loglevel')) - self.indexing = Indexing() self.user_agents = UserAgents() self.is_public_instance = get_config('generic', 'public_instance') self.public_domain = get_config('generic', 'public_domain') @@ -938,214 +936,10 @@ class Lookyloo(): return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page) - set(ct.root_hartree.all_url_requests.keys())) - def get_body_hash_investigator(self, body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]: - '''Returns all the captures related to a hash (sha512), used in the web interface.''' - total_captures, details = self.indexing.get_body_hash_captures(body_hash, limit=-1) - captures = [] - for capture_uuid, hostnode_uuid, hostname, _, url in details: - cache = self.capture_cache(capture_uuid) - if not cache: - continue - captures.append((cache.uuid, cache.title, cache.timestamp, hostnode_uuid, url)) - domains = self.indexing.get_body_hash_domains(body_hash) - return captures, domains - - def get_body_hash_full(self, body_hash: str, /) -> tuple[dict[str, list[dict[str, str]]], BytesIO]: - '''Returns a lot of information about the hash (sha512) and the hits in the instance. - Also contains the data (base64 encoded)''' - details = self.indexing.get_body_hash_urls(body_hash) - - # Break immediately if we have the hash of the empty file - if body_hash == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e': - return details, BytesIO() - - # get the body from the first entry in the details list - for _, entries in details.items(): - if not entries: - continue - ct = self.get_crawled_tree(entries[0]['capture']) - try: - urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode']) - except Exception: - # Unable to find URLnode in the tree, it probably has been rebuild. - self.logger.warning(f'Unable to find {entries[0]["urlnode"]} in entries[0]["capture"]') - continue - - # From that point, we just try to get the content. Break as soon as we found one. - if urlnode.body_hash == body_hash: - # the hash we're looking for is the whole file - return details, urlnode.body - else: - # The hash is an embedded resource - for _, blobs in urlnode.embedded_ressources.items(): - for h, b in blobs: - if h == body_hash: - return details, b - - # TODO: Couldn't find the file anywhere. Maybe return a warning in the file? 
- return details, BytesIO() - - def get_all_body_hashes(self, capture_uuid: str, /) -> dict[str, dict[str, URLNode | int]]: - ct = self.get_crawled_tree(capture_uuid) - to_return: dict[str, dict[str, URLNode | int]] = defaultdict() - for node in ct.root_hartree.url_tree.traverse(): - if node.empty_response or node.body_hash in to_return: - # If we have the same hash more than once, skip - continue - total_captures, details = self.indexing.get_body_hash_captures(node.body_hash, limit=-1) - # Note for future: mayeb get url, capture title, something better than just the hash to show to the user - to_return[node.body_hash] = {'node': node, 'total_captures': total_captures} - return to_return - - def get_latest_url_capture(self, url: str, /) -> CaptureCache | None: - '''Get the most recent capture with this URL''' - captures = self.sorted_capture_cache(self.indexing.get_captures_url(url)) - if captures: - return captures[0] - return None - - def get_url_occurrences(self, url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: - '''Get the most recent captures and URL nodes where the URL has been seen.''' - captures = self.sorted_capture_cache(self.indexing.get_captures_url(url), cached_captures_only=cached_captures_only) - - to_return: list[dict[str, Any]] = [] - for capture in captures[:limit]: - ct = self.get_crawled_tree(capture.uuid) - to_append: dict[str, str | dict[str, Any]] = {'capture_uuid': capture.uuid, - 'start_timestamp': capture.timestamp.isoformat(), - 'title': capture.title} - urlnodes: dict[str, dict[str, str]] = {} - for urlnode in ct.root_hartree.url_tree.search_nodes(name=url): - urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(), - 'hostnode_uuid': urlnode.hostnode_uuid} - if hasattr(urlnode, 'body_hash'): - urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash - to_append['urlnodes'] = urlnodes - to_return.append(to_append) - return to_return - - def get_hostname_occurrences(self, hostname: str, /, with_urls_occurrences: bool=False, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: - '''Get the most recent captures and URL nodes where the hostname has been seen.''' - captures = self.sorted_capture_cache(self.indexing.get_captures_hostname(hostname), cached_captures_only=cached_captures_only) - - to_return: list[dict[str, Any]] = [] - for capture in captures[:limit]: - ct = self.get_crawled_tree(capture.uuid) - to_append: dict[str, str | list[Any] | dict[str, Any]] = { - 'capture_uuid': capture.uuid, - 'start_timestamp': capture.timestamp.isoformat(), - 'title': capture.title} - hostnodes: list[str] = [] - if with_urls_occurrences: - urlnodes: dict[str, dict[str, str]] = {} - for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname): - hostnodes.append(hostnode.uuid) - if with_urls_occurrences: - for urlnode in hostnode.urls: - urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(), - 'url': urlnode.name, - 'hostnode_uuid': urlnode.hostnode_uuid} - if hasattr(urlnode, 'body_hash'): - urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash - to_append['hostnodes'] = hostnodes - if with_urls_occurrences: - to_append['urlnodes'] = urlnodes - to_return.append(to_append) - return to_return - - def get_cookie_name_investigator(self, cookie_name: str, /) -> tuple[list[tuple[str, str]], list[tuple[str, float, list[tuple[str, float]]]]]: - '''Returns all the captures related to a cookie name entry, used in the web interface.''' - cached_captures = self.sorted_capture_cache([entry[0] for 
entry in self.indexing.get_cookies_names_captures(cookie_name)]) - captures = [(cache.uuid, cache.title) for cache in cached_captures] - domains = [(domain, freq, self.indexing.cookies_names_domains_values(cookie_name, domain)) - for domain, freq in self.indexing.get_cookie_domains(cookie_name)] - return captures, domains - def compute_mmh3_shodan(self, favicon: bytes, /) -> str: b64 = base64.encodebytes(favicon) return str(mmh3.hash(b64)) - def get_favicon_investigator(self, favicon_sha512: str, - /, - get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]], - tuple[str, str, str], - dict[str, dict[str, dict[str, tuple[str, str]]]]]: - '''Returns all the captures related to a cookie name entry, used in the web interface.''' - cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)]) - captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures] - favicon = self.indexing.get_favicon(favicon_sha512) - if favicon: - mimetype = from_string(favicon, mime=True) - b64_favicon = base64.b64encode(favicon).decode() - mmh3_shodan = self.compute_mmh3_shodan(favicon) - else: - mimetype = '' - b64_favicon = '' - mmh3_shodan = '' - - # For now, there is only one probabilistic hash algo for favicons, keeping it simple - probabilistic_hash_algos = ['mmh3-shodan'] - probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {} - if get_probabilistic: - for algo in probabilistic_hash_algos: - probabilistic_favicons[algo] = {} - for mm3hash in self.indexing.get_probabilistic_hashes_favicon(algo, favicon_sha512): - probabilistic_favicons[algo][mm3hash] = {} - for sha512 in self.indexing.get_hashes_favicon_probablistic(algo, mm3hash): - if sha512 == favicon_sha512: - # Skip entry if it is the same as the favicon we are investigating - continue - favicon = self.indexing.get_favicon(sha512) - if favicon: - mimetype = from_string(favicon, mime=True) - b64_favicon = base64.b64encode(favicon).decode() - probabilistic_favicons[algo][mm3hash][sha512] = (mimetype, b64_favicon) - if not probabilistic_favicons[algo][mm3hash]: - # remove entry if it has no favicon - probabilistic_favicons[algo].pop(mm3hash) - if not probabilistic_favicons[algo]: - # remove entry if it has no hash - probabilistic_favicons.pop(algo) - return captures, (mimetype, b64_favicon, mmh3_shodan), probabilistic_favicons - - def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]: - '''Returns all the captures related to a cookie name entry, used in the web interface.''' - all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh)) - if cached_captures := self.sorted_capture_cache([entry for entry in all_captures]): - captures = [] - for cache in cached_captures: - try: - urlnode = self.get_urlnode_from_tree(cache.uuid, all_captures[cache.uuid]) - except Exception as e: - self.logger.warning(f'Cache for {cache.uuid} needs a rebuild: {e}.') - self._captures_index.remove_pickle(cache.uuid) - continue - captures.append((cache.uuid, urlnode.hostnode_uuid, urlnode.name, cache.title)) - # get the headers and format them as they were in the response - urlnode = self.get_urlnode_from_tree(cached_captures[0].uuid, all_captures[cached_captures[0].uuid]) - headers = [(header["name"], header["value"]) for header in urlnode.response['headers']] - return captures, headers - return [], [] - - def hash_lookup(self, blob_hash: str, url: str, capture_uuid: str) -> 
tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]: - '''Search all the captures a specific hash was seen. - If a URL is given, it splits the results if the hash is seen on the same URL or an other one. - Capture UUID avoids duplicates on the same capture''' - captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []} - total_captures, details = self.indexing.get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1, - prefered_uuids=set(self._captures_index.keys())) - for h_capture_uuid, url_uuid, url_hostname, same_url, url in details: - cache = self.capture_cache(h_capture_uuid) - if cache and hasattr(cache, 'title'): - if same_url: - captures_list['same_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname)) - else: - captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname)) - # Sort by timestamp by default - captures_list['same_url'].sort(key=lambda y: y[3]) - captures_list['different_url'].sort(key=lambda y: y[3]) - return total_captures, captures_list - def get_ressource(self, tree_uuid: str, /, urlnode_uuid: str, h: str | None) -> tuple[str, BytesIO, str] | None: '''Get a specific resource from a URL node. If a hash s also given, we want an embeded resource''' try: @@ -1375,116 +1169,6 @@ class Lookyloo(): """Get the preconfigured devices from Playwright""" return get_devices() - def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> tuple[HostNode, list[dict[str, Any]]]: - '''Gather all the informations needed to display the Hostnode investigator popup.''' - - def normalize_known_content(h: str, /, known_content: dict[str, Any], url: URLNode) -> tuple[str | list[Any] | None, tuple[bool, Any] | None]: - ''' There are a few different sources to figure out known vs. 
legitimate content, - this method normalize it for the web interface.''' - known: str | list[Any] | None = None - legitimate: tuple[bool, Any] | None = None - if h not in known_content: - return known, legitimate - - if known_content[h]['type'] in ['generic', 'sanejs']: - known = known_content[h]['details'] - elif known_content[h]['type'] == 'legitimate_on_domain': - legit = False - if url.hostname in known_content[h]['details']: - legit = True - legitimate = (legit, known_content[h]['details']) - elif known_content[h]['type'] == 'malicious': - legitimate = (False, known_content[h]['details']) - - return known, legitimate - - ct = self.get_crawled_tree(capture_uuid) - hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid) - - known_content = self.context.find_known_content(hostnode) - self.uwhois.query_whois_hostnode(hostnode) - - urls: list[dict[str, Any]] = [] - for url in hostnode.urls: - # For the popup, we need: - # * https vs http - # * everything after the domain - # * the full URL - to_append: dict[str, Any] = { - 'encrypted': url.name.startswith('https'), - 'url_path': url.name.split('/', 3)[-1], - 'url_object': url, - } - - if not url.empty_response: - # Index lookup - # %%% Full body %%% - freq = self.indexing.body_hash_fequency(url.body_hash) - to_append['body_hash_details'] = freq - if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1: - to_append['body_hash_details']['other_captures'] = self.hash_lookup(url.body_hash, url.name, capture_uuid) - - # %%% Embedded ressources %%% - if hasattr(url, 'embedded_ressources') and url.embedded_ressources: - to_append['embedded_ressources'] = {} - for mimetype, blobs in url.embedded_ressources.items(): - for h, blob in blobs: - if h in to_append['embedded_ressources']: - # Skip duplicates - continue - freq_embedded = self.indexing.body_hash_fequency(h) - to_append['embedded_ressources'][h] = freq_embedded - to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes - to_append['embedded_ressources'][h]['type'] = mimetype - if freq_embedded['hash_freq'] > 1: - to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid) - for h in to_append['embedded_ressources'].keys(): - known, legitimate = normalize_known_content(h, known_content, url) - if known: - to_append['embedded_ressources'][h]['known_content'] = known - elif legitimate: - to_append['embedded_ressources'][h]['legitimacy'] = legitimate - - known, legitimate = normalize_known_content(url.body_hash, known_content, url) - if known: - to_append['known_content'] = known - elif legitimate: - to_append['legitimacy'] = legitimate - - # Optional: Cookies sent to server in request -> map to nodes who set the cookie in response - if hasattr(url, 'cookies_sent'): - to_display_sent: dict[str, set[Iterable[str | None]]] = defaultdict(set) - for cookie, contexts in url.cookies_sent.items(): - if not contexts: - # Locally created? 
- to_display_sent[cookie].add(('Unknown origin', )) - continue - for context in contexts: - to_display_sent[cookie].add((context['setter'].hostname, context['setter'].hostnode_uuid)) - to_append['cookies_sent'] = to_display_sent - - # Optional: Cookies received from server in response -> map to nodes who send the cookie in request - if hasattr(url, 'cookies_received'): - to_display_received: dict[str, dict[str, set[Iterable[str | None]]]] = {'3rd_party': defaultdict(set), 'sent': defaultdict(set), 'not_sent': defaultdict(set)} - for domain, c_received, is_3rd_party in url.cookies_received: - if c_received not in ct.root_hartree.cookies_sent: - # This cookie is never sent. - if is_3rd_party: - to_display_received['3rd_party'][c_received].add((domain, )) - else: - to_display_received['not_sent'][c_received].add((domain, )) - continue - - for url_node in ct.root_hartree.cookies_sent[c_received]: - if is_3rd_party: - to_display_received['3rd_party'][c_received].add((url_node.hostname, url_node.hostnode_uuid)) - else: - to_display_received['sent'][c_received].add((url_node.hostname, url_node.hostnode_uuid)) - to_append['cookies_received'] = to_display_received - - urls.append(to_append) - return hostnode, urls - def get_stats(self) -> dict[str, list[Any]]: '''Gather statistics about the lookyloo instance''' today = date.today() diff --git a/poetry.lock b/poetry.lock index 375e51a..4c0facc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,25 +1,25 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. [[package]] name = "aiobotocore" -version = "2.11.2" +version = "2.12.1" description = "Async client for aws services using botocore and aiohttp" optional = false python-versions = ">=3.8" files = [ - {file = "aiobotocore-2.11.2-py3-none-any.whl", hash = "sha256:487fede588040bfa3a43df945275c28c1c73ca75bf705295adb9fbadd2e89be7"}, - {file = "aiobotocore-2.11.2.tar.gz", hash = "sha256:6dd7352248e3523019c5a54a395d2b1c31080697fc80a9ad2672de4eec8c7abd"}, + {file = "aiobotocore-2.12.1-py3-none-any.whl", hash = "sha256:6a9a3d646cf422f45fdc1e4256e78563ebffba64733bc9b8ca9123614e8ba9af"}, + {file = "aiobotocore-2.12.1.tar.gz", hash = "sha256:8706b28f16f93c541f6ed50352115a79d8f3499539f8d0bb70aa0f7a5379c1fe"}, ] [package.dependencies] aiohttp = ">=3.7.4.post0,<4.0.0" aioitertools = ">=0.5.1,<1.0.0" -botocore = ">=1.33.2,<1.34.35" +botocore = ">=1.34.41,<1.34.52" wrapt = ">=1.10.10,<2.0.0" [package.extras] -awscli = ["awscli (>=1.31.2,<1.32.35)"] -boto3 = ["boto3 (>=1.33.2,<1.34.35)"] +awscli = ["awscli (>=1.32.41,<1.32.52)"] +boto3 = ["boto3 (>=1.34.41,<1.34.52)"] [[package]] name = "aiohttp" @@ -308,13 +308,13 @@ WTForms = "*" [[package]] name = "botocore" -version = "1.34.34" +version = "1.34.51" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">= 3.8" files = [ - {file = "botocore-1.34.34-py3-none-any.whl", hash = "sha256:cd060b0d88ebb2b893f1411c1db7f2ba66cc18e52dcc57ad029564ef5fec437b"}, - {file = "botocore-1.34.34.tar.gz", hash = "sha256:54093dc97372bb7683f5c61a279aa8240408abf3b2cc494ae82a9a90c1b784b5"}, + {file = "botocore-1.34.51-py3-none-any.whl", hash = "sha256:01d5156247f991b3466a8404e3d7460a9ecbd9b214f9992d6ba797d9ddc6f120"}, + {file = "botocore-1.34.51.tar.gz", hash = "sha256:5086217442e67dd9de36ec7e87a0c663f76b7790d5fb6a12de565af95e87e319"}, ] [package.dependencies] @@ -1331,13 +1331,13 @@ test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.22)", "pa [[package]] name = "ipython" -version = "8.22.1" +version = "8.22.2" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.10" files = [ - {file = "ipython-8.22.1-py3-none-any.whl", hash = "sha256:869335e8cded62ffb6fac8928e5287a05433d6462e3ebaac25f4216474dd6bc4"}, - {file = "ipython-8.22.1.tar.gz", hash = "sha256:39c6f9efc079fb19bfb0f17eee903978fe9a290b1b82d68196c641cecb76ea22"}, + {file = "ipython-8.22.2-py3-none-any.whl", hash = "sha256:3c86f284c8f3d8f2b6c662f885c4889a91df7cd52056fd02b7d8d6195d7f56e9"}, + {file = "ipython-8.22.2.tar.gz", hash = "sha256:2dcaad9049f9056f1fef63514f176c7d41f930daa78d05b82a176202818f2c14"}, ] [package.dependencies] @@ -1463,20 +1463,21 @@ referencing = ">=0.31.0" [[package]] name = "lacuscore" -version = "1.8.7" +version = "1.8.8" description = "Core of Lacus, usable as a module" optional = false python-versions = ">=3.8,<4.0" files = [ - {file = "lacuscore-1.8.7-py3-none-any.whl", hash = "sha256:1ac849b1308eb780f1976fdf21d6476bd911e6ae1e91f79d83612baef90afee8"}, - {file = "lacuscore-1.8.7.tar.gz", hash = "sha256:67268fb4da1282d1c7f747b02611dd5ee549644e034d0acba2173396ce0d0408"}, + {file = "lacuscore-1.8.8-py3-none-any.whl", hash = "sha256:04812225e101ec59b3d1dcc6d3474e3cd2f3fd656a72d619e7d7d238d00b5a27"}, + {file = "lacuscore-1.8.8.tar.gz", hash = "sha256:41949ff67d056f8ba717b649d8b45307ff7d38d4c38291cb1a8b80ca2ce94f6f"}, ] [package.dependencies] +async-timeout = {version = ">=4.0.3,<5.0.0", markers = "python_version < \"3.11\""} defang = ">=0.5.3,<0.6.0" dnspython = ">=2.6.1,<3.0.0" -playwrightcapture = {version = ">=1.23.8,<2.0.0", extras = ["recaptcha"]} -redis = {version = ">=5.0.1,<6.0.0", extras = ["hiredis"]} +playwrightcapture = {version = ">=1.23.9,<2.0.0", extras = ["recaptcha"]} +redis = {version = ">=5.0.2,<6.0.0", extras = ["hiredis"]} requests = ">=2.31.0,<3.0.0" ua-parser = ">=0.18.0,<0.19.0" @@ -2286,13 +2287,13 @@ test = ["pytest"] [[package]] name = "playwrightcapture" -version = "1.23.8" +version = "1.23.9" description = "A simple library to capture websites using playwright" optional = false python-versions = ">=3.8,<4.0" files = [ - {file = "playwrightcapture-1.23.8-py3-none-any.whl", hash = "sha256:f3e4d6c0355b013e465f9d3eea961b9431303a5de227a1388a7287c872203b9e"}, - {file = "playwrightcapture-1.23.8.tar.gz", hash = "sha256:d2caea8d7a16d739f28dc06bbbc12665be89d07d325bba6868dab5f8520db809"}, + {file = "playwrightcapture-1.23.9-py3-none-any.whl", hash = "sha256:0324f587605aa85ede1b71c12ec735383d932324f0e66ef35345c6e08734273c"}, + {file = "playwrightcapture-1.23.9.tar.gz", hash = "sha256:e7217fc2a6109f240918de977452c556f482822abb12f0db43fa28228d3c0c90"}, ] [package.dependencies] @@ -2339,13 +2340,13 @@ files = [ [[package]] name = "publicsuffixlist" -version = "0.10.0.20240214" +version = "0.10.0.20240305" description = 
"publicsuffixlist implement" optional = false python-versions = ">=2.6" files = [ - {file = "publicsuffixlist-0.10.0.20240214-py2.py3-none-any.whl", hash = "sha256:2c3b8da819571bb610328bda5b25d27fcbf6bc400896ca3c6502d291a16b32f4"}, - {file = "publicsuffixlist-0.10.0.20240214.tar.gz", hash = "sha256:45a206c5f9c1eccf138481280cfb0a67c2ccafc782ef89c7fd6dc6c4356230fe"}, + {file = "publicsuffixlist-0.10.0.20240305-py2.py3-none-any.whl", hash = "sha256:f6869119f8781501c0c625e59b4b65eb60e2ed5185cfd6c142c792f74ac47c21"}, + {file = "publicsuffixlist-0.10.0.20240305.tar.gz", hash = "sha256:6e79ea73b0278ce1b102f3ad6815f2a5b683864da9948ba0b0eab3180c419f7f"}, ] [package.extras] @@ -2571,13 +2572,13 @@ docs = ["Sphinx (<7.2)", "Sphinx (>=7.2,<8.0)"] [[package]] name = "pymisp" -version = "2.4.185" +version = "2.4.186" description = "Python API for MISP." optional = false python-versions = ">=3.8,<4.0" files = [ - {file = "pymisp-2.4.185-py3-none-any.whl", hash = "sha256:e2635a2be92321d4f812c7220bd955817e95a286343720f138b87892a827117a"}, - {file = "pymisp-2.4.185.tar.gz", hash = "sha256:3ccdc6ee48d26d82c77ba3f5d8fd41a79eaaef0ad5619f37a65b060e92f6da4c"}, + {file = "pymisp-2.4.186-py3-none-any.whl", hash = "sha256:bb8ae23d038848a86cf5d6a4c965dbed79e48cd6f671681b17f72410aecf07a0"}, + {file = "pymisp-2.4.186.tar.gz", hash = "sha256:bdf2d54b297ad890418179b044dd4ea79821fccef723823919d12262e9794ca3"}, ] [package.dependencies] @@ -2593,7 +2594,7 @@ requests = ">=2.31.0,<3.0.0" [package.extras] brotli = ["urllib3[brotli]"] docs = ["Sphinx (<7.2)", "Sphinx (>=7.2,<8.0)", "recommonmark (>=0.7.1,<0.8.0)", "sphinx-autodoc-typehints (>=2.0.0,<3.0.0)"] -email = ["RTFDE (>=0.1.1,<0.2.0)", "extract_msg (>=0.47.0,<0.48.0)", "oletools (>=0.60.1,<0.61.0)"] +email = ["RTFDE (>=0.1.1,<0.2.0)", "extract_msg (>=0.48.0,<0.49.0)", "oletools (>=0.60.1,<0.61.0)"] fileobjects = ["lief (>=0.14.1,<0.15.0)", "pydeep2 (>=0.5.1,<0.6.0)", "python-magic (>=0.4.27,<0.5.0)"] openioc = ["beautifulsoup4 (>=4.12.3,<5.0.0)"] pdfexport = ["reportlab (>=4.1.0,<5.0.0)"] @@ -2668,13 +2669,13 @@ requests = ">=2.31.0,<3.0.0" [[package]] name = "pysecuritytxt" -version = "1.2.2" +version = "1.3.0" description = "Python CLI and module for querying security.txt files on domains." 
optional = false python-versions = ">=3.8,<4.0" files = [ - {file = "pysecuritytxt-1.2.2-py3-none-any.whl", hash = "sha256:08d8750d82e9502ba949a6ea7bab355ca183cfc3cd722ed3e492ba35a8d4edda"}, - {file = "pysecuritytxt-1.2.2.tar.gz", hash = "sha256:31d4ea4814e2cdeffce304e7b6f9d58580e7fb6578c8694bb6f8c0df59e65b3d"}, + {file = "pysecuritytxt-1.3.0-py3-none-any.whl", hash = "sha256:9e4eb6b4fdca8f8471c80696c4d7642be24d44c8c3f627870ca9b7bd3f221cd5"}, + {file = "pysecuritytxt-1.3.0.tar.gz", hash = "sha256:3669be69e90672ed0d448b385e5fef49cb3a6a611d7e386d673c4f0e1cc3e83b"}, ] [package.dependencies] @@ -2712,13 +2713,13 @@ webui = ["Flask (>=2.0,<3.0)", "Flask-Bootstrap (>=3.3.7.1,<4.0.0.0)", "Flask-WT [[package]] name = "python-dateutil" -version = "2.8.2" +version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ - {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, - {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, ] [package.dependencies] @@ -2748,17 +2749,17 @@ files = [ [[package]] name = "redis" -version = "5.0.1" +version = "5.0.2" description = "Python client for Redis database and key-value store" optional = false python-versions = ">=3.7" files = [ - {file = "redis-5.0.1-py3-none-any.whl", hash = "sha256:ed4802971884ae19d640775ba3b03aa2e7bd5e8fb8dfaed2decce4d0fc48391f"}, - {file = "redis-5.0.1.tar.gz", hash = "sha256:0dab495cd5753069d3bc650a0dde8a8f9edde16fc5691b689a566eda58100d0f"}, + {file = "redis-5.0.2-py3-none-any.whl", hash = "sha256:4caa8e1fcb6f3c0ef28dba99535101d80934b7d4cd541bbb47f4a3826ee472d1"}, + {file = "redis-5.0.2.tar.gz", hash = "sha256:3f82cc80d350e93042c8e6e7a5d0596e4dd68715babffba79492733e1f367037"}, ] [package.dependencies] -async-timeout = {version = ">=4.0.2", markers = "python_full_version <= \"3.11.2\""} +async-timeout = ">=4.0.3" hiredis = {version = ">=1.0.0", optional = true, markers = "extra == \"hiredis\""} [package.extras] @@ -2942,6 +2943,7 @@ optional = false python-versions = "*" files = [ {file = "requests-file-2.0.0.tar.gz", hash = "sha256:20c5931629c558fda566cacc10cfe2cd502433e628f568c34c80d96a0cc95972"}, + {file = "requests_file-2.0.0-py2.py3-none-any.whl", hash = "sha256:3e493d390adb44aa102ebea827a48717336d5268968c370eaf19abaf5cae13bf"}, ] [package.dependencies] @@ -2949,13 +2951,13 @@ requests = ">=1.0.0" [[package]] name = "rich" -version = "13.7.0" +version = "13.7.1" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.7.0" files = [ - {file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"}, - {file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"}, + {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"}, + {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"}, ] 
[package.dependencies] @@ -3217,13 +3219,13 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0, [[package]] name = "types-beautifulsoup4" -version = "4.12.0.20240106" +version = "4.12.0.20240229" description = "Typing stubs for beautifulsoup4" optional = false python-versions = ">=3.8" files = [ - {file = "types-beautifulsoup4-4.12.0.20240106.tar.gz", hash = "sha256:98d628985b71b140bd3bc22a8cb0ab603c2f2d08f20d37925965eb4a21739be8"}, - {file = "types_beautifulsoup4-4.12.0.20240106-py3-none-any.whl", hash = "sha256:cbdd60ab8aeac737ac014431b6e921b43e84279c0405fdd25a6900bb0e71da5b"}, + {file = "types-beautifulsoup4-4.12.0.20240229.tar.gz", hash = "sha256:e37e4cfa11b03b01775732e56d2c010cb24ee107786277bae6bc0fa3e305b686"}, + {file = "types_beautifulsoup4-4.12.0.20240229-py3-none-any.whl", hash = "sha256:000cdddb8aee4effb45a04be95654de8629fb8594a4f2f1231cff81108977324"}, ] [package.dependencies] @@ -3242,13 +3244,13 @@ files = [ [[package]] name = "types-html5lib" -version = "1.1.11.20240222" +version = "1.1.11.20240228" description = "Typing stubs for html5lib" optional = false python-versions = ">=3.8" files = [ - {file = "types-html5lib-1.1.11.20240222.tar.gz", hash = "sha256:d9517ec6ba2fa1f63113e2930a59b60722a976cc983b94d7fd772f14865e1152"}, - {file = "types_html5lib-1.1.11.20240222-py3-none-any.whl", hash = "sha256:86b2dcbbebca846e68d2eac46b2717980e632de4b5d8f62ccd23d8333d2e7647"}, + {file = "types-html5lib-1.1.11.20240228.tar.gz", hash = "sha256:22736b7299e605ec4ba539d48691e905fd0c61c3ea610acc59922232dc84cede"}, + {file = "types_html5lib-1.1.11.20240228-py3-none-any.whl", hash = "sha256:af5de0125cb0fe5667543b158db83849b22e25c0e36c9149836b095548bf1020"}, ] [[package]] @@ -3275,13 +3277,13 @@ files = [ [[package]] name = "types-pyopenssl" -version = "24.0.0.20240130" +version = "24.0.0.20240228" description = "Typing stubs for pyOpenSSL" optional = false python-versions = ">=3.8" files = [ - {file = "types-pyOpenSSL-24.0.0.20240130.tar.gz", hash = "sha256:c812e5c1c35249f75ef5935708b2a997d62abf9745be222e5f94b9595472ab25"}, - {file = "types_pyOpenSSL-24.0.0.20240130-py3-none-any.whl", hash = "sha256:24a255458b5b8a7fca8139cf56f2a8ad5a4f1a5f711b73a5bb9cb50dc688fab5"}, + {file = "types-pyOpenSSL-24.0.0.20240228.tar.gz", hash = "sha256:cd990717d8aa3743ef0e73e0f462e64b54d90c304249232d48fece4f0f7c3c6a"}, + {file = "types_pyOpenSSL-24.0.0.20240228-py3-none-any.whl", hash = "sha256:a472cf877a873549175e81972f153f44e975302a3cf17381eb5f3d41ccfb75a4"}, ] [package.dependencies] @@ -3734,4 +3736,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "bc64701a9d95985f7d0c91086fabfc29a1c54affc60bfab612fecc3771d6acd4" +content-hash = "7e76c4614efed850e101ecaa1e91f141649ef4ad508522f0323e8efffc9eda7d" diff --git a/pyproject.toml b/pyproject.toml index e1d8387..0c87a94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ shutdown = "bin.shutdown:main" run_backend = "bin.run_backend:main" async_capture = "bin.async_capture:main" background_indexer = "bin.background_indexer:main" +background_build_captures = "bin.background_build_captures:main" +background_full_indexer = "bin.background_indexer:main_full_indexer" archiver = "bin.archiver:main" processing = "bin.background_processing:main" start_website = "bin.start_website:main" @@ -40,7 +42,7 @@ requests = "^2.31.0" flask = "^3.0.2" gunicorn = "^21.2.0" charset-normalizer = "^3.3.2" -redis = {version = 
"^5.0.1", extras = ["hiredis"]} +redis = {version = "^5.0.2", extras = ["hiredis"]} beautifulsoup4 = {version = "^4.12.3", extras = ["lxml", "charset_normalizer"]} bootstrap-flask = "^2.3.3" defang = "^0.5.3" @@ -50,10 +52,10 @@ pysanejs = "^2.0.2" pylookyloo = "^1.23.1" dnspython = "^2.6.1" pytaxonomies = "^1.5.0" -pymisp = {version = "^2.4.185", extras = ["url", "fileobjects"]} +pymisp = {version = "^2.4.186", extras = ["url", "fileobjects"]} Pillow = "^10.2.0" flask-restx = "^1.3.0" -rich = "^13.7.0" +rich = "^13.7.1" pyphishtanklookup = "^1.3.2" Flask-Cors = "^4.0.0" pyhashlookup = "^1.2.2" @@ -65,13 +67,13 @@ passivetotal = "^2.5.9" werkzeug = "^3.0.1" filetype = "^1.2.0" pypandora = "^1.8.0" -lacuscore = "^1.8.7" +lacuscore = "^1.8.8" pylacus = "^1.8.0" pyipasnhistory = "^2.1.2" publicsuffixlist = "^0.10.0.20240205" pyfaup = "^1.2" chardet = "^5.2.0" -pysecuritytxt = "^1.2.2" +pysecuritytxt = "^1.3.0" pylookyloomonitoring = "^1.1.3" pytz = {"version" = "^2024.1", python = "<3.9"} s3fs = "^2024.2.0" @@ -98,7 +100,7 @@ types-redis = {version = "^4.6.0.20240218"} types-pkg-resources = "^0.1.3" types-Deprecated = "^1.2.9.20240106" types-python-dateutil = "^2.8.19.20240106" -types-beautifulsoup4 = "^4.12.0.20240106" +types-beautifulsoup4 = "^4.12.0.20240229" types-Pillow = "^10.2.0.20240213" types-pytz = "^2024.1.0.20240203" diff --git a/website/web/__init__.py b/website/web/__init__.py index 7980381..57c2a3d 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -17,14 +17,16 @@ import time import filetype # type: ignore[import-untyped] +from collections import defaultdict from datetime import date, datetime, timedelta, timezone from importlib.metadata import version from io import BytesIO, StringIO -from typing import Any, TypedDict +from typing import Any, TypedDict, Iterable from urllib.parse import quote_plus, unquote_plus, urlparse from uuid import uuid4 from zipfile import ZipFile +from har2tree import HostNode, URLNode import flask_login # type: ignore[import-untyped] from flask import (Flask, Response, Request, flash, jsonify, redirect, render_template, request, send_file, url_for) @@ -37,7 +39,8 @@ from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined] from werkzeug.security import check_password_hash from werkzeug.wrappers.response import Response as WerkzeugResponse -from lookyloo import Lookyloo, CaptureSettings +from lookyloo import Lookyloo, CaptureSettings, Indexing +from lookyloo.capturecache import CaptureCache from lookyloo.default import get_config from lookyloo.exceptions import MissingUUID, NoValidHarFile from lookyloo.helpers import get_taxonomies, UserAgents, load_cookies @@ -262,6 +265,353 @@ def file_response(func): # type: ignore[no-untyped-def] return wrapper +# ##### Methods querying the indexes ##### + +@functools.cache +def get_indexing(user: User | None) -> Indexing: + '''Depending if we're logged in or not, we (can) get different indexes: + if index_everything is enabled, we have an index in kvrocks that contains all + the indexes for all the captures. + It is only accessible to the admin user. 
+ ''' + if not get_config('generic', 'index_everything'): + return Indexing() + + if not user or not user.is_authenticated: + # No user or anonymous + return Indexing() + # Logged in user + return Indexing(full_index=True) + + +def _get_body_hash_investigator(body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]: + '''Returns all the captures related to a hash (sha512), used in the web interface.''' + total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(body_hash, limit=-1) + captures = [] + for capture_uuid, hostnode_uuid, hostname, _, url in details: + cache = lookyloo.capture_cache(capture_uuid) + if not cache: + continue + captures.append((cache.uuid, cache.title, cache.timestamp, hostnode_uuid, url)) + domains = get_indexing(flask_login.current_user).get_body_hash_domains(body_hash) + return captures, domains + + +def get_body_hash_full(body_hash: str, /) -> tuple[dict[str, list[dict[str, str]]], BytesIO]: + '''Returns a lot of information about the hash (sha512) and the hits in the instance. + Also contains the data (base64 encoded)''' + details = get_indexing(flask_login.current_user).get_body_hash_urls(body_hash) + + # Break immediately if we have the hash of the empty file + if body_hash == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e': + return details, BytesIO() + + # get the body from the first entry in the details list + for _, entries in details.items(): + if not entries: + continue + ct = lookyloo.get_crawled_tree(entries[0]['capture']) + try: + urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode']) + except Exception: + # Unable to find URLnode in the tree, it probably has been rebuild. + # TODO throw a log line or something + # self.logger.warning(f'Unable to find {entries[0]["urlnode"]} in entries[0]["capture"]') + # lookyloo._captures_index.remove_pickle() + continue + + # From that point, we just try to get the content. Break as soon as we found one. + if urlnode.body_hash == body_hash: + # the hash we're looking for is the whole file + return details, urlnode.body + else: + # The hash is an embedded resource + for _, blobs in urlnode.embedded_ressources.items(): + for h, b in blobs: + if h == body_hash: + return details, b + + # TODO: Couldn't find the file anywhere. Maybe return a warning in the file? 
+ return details, BytesIO() + + +def get_all_body_hashes(capture_uuid: str, /) -> dict[str, dict[str, URLNode | int]]: + ct = lookyloo.get_crawled_tree(capture_uuid) + to_return: dict[str, dict[str, URLNode | int]] = defaultdict() + for node in ct.root_hartree.url_tree.traverse(): + if node.empty_response or node.body_hash in to_return: + # If we have the same hash more than once, skip + continue + total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(node.body_hash, limit=-1) + # Note for future: mayeb get url, capture title, something better than just the hash to show to the user + to_return[node.body_hash] = {'node': node, 'total_captures': total_captures} + return to_return + + +def get_latest_url_capture(url: str, /) -> CaptureCache | None: + '''Get the most recent capture with this URL''' + captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url)) + if captures: + return captures[0] + return None + + +def get_url_occurrences(url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: + '''Get the most recent captures and URL nodes where the URL has been seen.''' + captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url), cached_captures_only=cached_captures_only) + + to_return: list[dict[str, Any]] = [] + for capture in captures[:limit]: + ct = lookyloo.get_crawled_tree(capture.uuid) + to_append: dict[str, str | dict[str, Any]] = {'capture_uuid': capture.uuid, + 'start_timestamp': capture.timestamp.isoformat(), + 'title': capture.title} + urlnodes: dict[str, dict[str, str]] = {} + for urlnode in ct.root_hartree.url_tree.search_nodes(name=url): + urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(), + 'hostnode_uuid': urlnode.hostnode_uuid} + if hasattr(urlnode, 'body_hash'): + urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash + to_append['urlnodes'] = urlnodes + to_return.append(to_append) + return to_return + + +def get_hostname_occurrences(hostname: str, /, with_urls_occurrences: bool=False, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: + '''Get the most recent captures and URL nodes where the hostname has been seen.''' + captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_hostname(hostname), cached_captures_only=cached_captures_only) + + to_return: list[dict[str, Any]] = [] + for capture in captures[:limit]: + ct = lookyloo.get_crawled_tree(capture.uuid) + to_append: dict[str, str | list[Any] | dict[str, Any]] = { + 'capture_uuid': capture.uuid, + 'start_timestamp': capture.timestamp.isoformat(), + 'title': capture.title} + hostnodes: list[str] = [] + if with_urls_occurrences: + urlnodes: dict[str, dict[str, str]] = {} + for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname): + hostnodes.append(hostnode.uuid) + if with_urls_occurrences: + for urlnode in hostnode.urls: + urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(), + 'url': urlnode.name, + 'hostnode_uuid': urlnode.hostnode_uuid} + if hasattr(urlnode, 'body_hash'): + urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash + to_append['hostnodes'] = hostnodes + if with_urls_occurrences: + to_append['urlnodes'] = urlnodes + to_return.append(to_append) + return to_return + + +def get_cookie_name_investigator(cookie_name: str, /) -> tuple[list[tuple[str, str]], list[tuple[str, float, list[tuple[str, float]]]]]: + '''Returns all the captures related to a cookie 
name entry, used in the web interface.''' + cached_captures = lookyloo.sorted_capture_cache([entry[0] for entry in get_indexing(flask_login.current_user).get_cookies_names_captures(cookie_name)]) + captures = [(cache.uuid, cache.title) for cache in cached_captures] + domains = [(domain, freq, get_indexing(flask_login.current_user).cookies_names_domains_values(cookie_name, domain)) + for domain, freq in get_indexing(flask_login.current_user).get_cookie_domains(cookie_name)] + return captures, domains + + +def get_favicon_investigator(favicon_sha512: str, + /, + get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]], + tuple[str, str, str], + dict[str, dict[str, dict[str, tuple[str, str]]]]]: + '''Returns all the captures related to a cookie name entry, used in the web interface.''' + cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_favicon(favicon_sha512)]) + captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures] + favicon = get_indexing(flask_login.current_user).get_favicon(favicon_sha512) + if favicon: + mimetype = from_string(favicon, mime=True) + b64_favicon = base64.b64encode(favicon).decode() + mmh3_shodan = lookyloo.compute_mmh3_shodan(favicon) + else: + mimetype = '' + b64_favicon = '' + mmh3_shodan = '' + + # For now, there is only one probabilistic hash algo for favicons, keeping it simple + probabilistic_hash_algos = ['mmh3-shodan'] + probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {} + if get_probabilistic: + for algo in probabilistic_hash_algos: + probabilistic_favicons[algo] = {} + for mm3hash in get_indexing(flask_login.current_user).get_probabilistic_hashes_favicon(algo, favicon_sha512): + probabilistic_favicons[algo][mm3hash] = {} + for sha512 in get_indexing(flask_login.current_user).get_hashes_favicon_probablistic(algo, mm3hash): + if sha512 == favicon_sha512: + # Skip entry if it is the same as the favicon we are investigating + continue + favicon = get_indexing(flask_login.current_user).get_favicon(sha512) + if favicon: + mimetype = from_string(favicon, mime=True) + b64_favicon = base64.b64encode(favicon).decode() + probabilistic_favicons[algo][mm3hash][sha512] = (mimetype, b64_favicon) + if not probabilistic_favicons[algo][mm3hash]: + # remove entry if it has no favicon + probabilistic_favicons[algo].pop(mm3hash) + if not probabilistic_favicons[algo]: + # remove entry if it has no hash + probabilistic_favicons.pop(algo) + return captures, (mimetype, b64_favicon, mmh3_shodan), probabilistic_favicons + + +def get_hhh_investigator(hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]: + '''Returns all the captures related to a cookie name entry, used in the web interface.''' + all_captures = dict(get_indexing(flask_login.current_user).get_http_headers_hashes_captures(hhh)) + if cached_captures := lookyloo.sorted_capture_cache([entry for entry in all_captures]): + captures = [] + for cache in cached_captures: + try: + urlnode = lookyloo.get_urlnode_from_tree(cache.uuid, all_captures[cache.uuid]) + except Exception: + # NOTE: print a logline + # logger.warning(f'Cache for {cache.uuid} needs a rebuild: {e}.') + lookyloo._captures_index.remove_pickle(cache.uuid) + continue + captures.append((cache.uuid, urlnode.hostnode_uuid, urlnode.name, cache.title)) + # get the headers and format them as they were in the response + urlnode = lookyloo.get_urlnode_from_tree(cached_captures[0].uuid, 
all_captures[cached_captures[0].uuid]) + headers = [(header["name"], header["value"]) for header in urlnode.response['headers']] + return captures, headers + return [], [] + + +def hash_lookup(blob_hash: str, url: str, capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]: + '''Search all the captures a specific hash was seen. + If a URL is given, it splits the results if the hash is seen on the same URL or an other one. + Capture UUID avoids duplicates on the same capture''' + captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []} + total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1, + prefered_uuids=set(lookyloo._captures_index.keys())) + for h_capture_uuid, url_uuid, url_hostname, same_url, url in details: + cache = lookyloo.capture_cache(h_capture_uuid) + if cache and hasattr(cache, 'title'): + if same_url: + captures_list['same_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname)) + else: + captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname)) + # Sort by timestamp by default + captures_list['same_url'].sort(key=lambda y: y[3]) + captures_list['different_url'].sort(key=lambda y: y[3]) + return total_captures, captures_list + + +def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[HostNode, list[dict[str, Any]]]: + '''Gather all the informations needed to display the Hostnode investigator popup.''' + + def normalize_known_content(h: str, /, known_content: dict[str, Any], url: URLNode) -> tuple[str | list[Any] | None, tuple[bool, Any] | None]: + ''' There are a few different sources to figure out known vs. 
legitimate content, + this method normalize it for the web interface.''' + known: str | list[Any] | None = None + legitimate: tuple[bool, Any] | None = None + if h not in known_content: + return known, legitimate + + if known_content[h]['type'] in ['generic', 'sanejs']: + known = known_content[h]['details'] + elif known_content[h]['type'] == 'legitimate_on_domain': + legit = False + if url.hostname in known_content[h]['details']: + legit = True + legitimate = (legit, known_content[h]['details']) + elif known_content[h]['type'] == 'malicious': + legitimate = (False, known_content[h]['details']) + + return known, legitimate + + ct = lookyloo.get_crawled_tree(capture_uuid) + hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid) + + known_content = lookyloo.context.find_known_content(hostnode) + lookyloo.uwhois.query_whois_hostnode(hostnode) + + urls: list[dict[str, Any]] = [] + for url in hostnode.urls: + # For the popup, we need: + # * https vs http + # * everything after the domain + # * the full URL + to_append: dict[str, Any] = { + 'encrypted': url.name.startswith('https'), + 'url_path': url.name.split('/', 3)[-1], + 'url_object': url, + } + + if not url.empty_response: + # Index lookup + # %%% Full body %%% + freq = get_indexing(flask_login.current_user).body_hash_fequency(url.body_hash) + to_append['body_hash_details'] = freq + if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1: + to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid) + + # %%% Embedded ressources %%% + if hasattr(url, 'embedded_ressources') and url.embedded_ressources: + to_append['embedded_ressources'] = {} + for mimetype, blobs in url.embedded_ressources.items(): + for h, blob in blobs: + if h in to_append['embedded_ressources']: + # Skip duplicates + continue + freq_embedded = get_indexing(flask_login.current_user).body_hash_fequency(h) + to_append['embedded_ressources'][h] = freq_embedded + to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes + to_append['embedded_ressources'][h]['type'] = mimetype + if freq_embedded['hash_freq'] > 1: + to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid) + for h in to_append['embedded_ressources'].keys(): + known, legitimate = normalize_known_content(h, known_content, url) + if known: + to_append['embedded_ressources'][h]['known_content'] = known + elif legitimate: + to_append['embedded_ressources'][h]['legitimacy'] = legitimate + + known, legitimate = normalize_known_content(url.body_hash, known_content, url) + if known: + to_append['known_content'] = known + elif legitimate: + to_append['legitimacy'] = legitimate + + # Optional: Cookies sent to server in request -> map to nodes who set the cookie in response + if hasattr(url, 'cookies_sent'): + to_display_sent: dict[str, set[Iterable[str | None]]] = defaultdict(set) + for cookie, contexts in url.cookies_sent.items(): + if not contexts: + # Locally created? 
+ to_display_sent[cookie].add(('Unknown origin', )) + continue + for context in contexts: + to_display_sent[cookie].add((context['setter'].hostname, context['setter'].hostnode_uuid)) + to_append['cookies_sent'] = to_display_sent + + # Optional: Cookies received from server in response -> map to nodes who send the cookie in request + if hasattr(url, 'cookies_received'): + to_display_received: dict[str, dict[str, set[Iterable[str | None]]]] = {'3rd_party': defaultdict(set), 'sent': defaultdict(set), 'not_sent': defaultdict(set)} + for domain, c_received, is_3rd_party in url.cookies_received: + if c_received not in ct.root_hartree.cookies_sent: + # This cookie is never sent. + if is_3rd_party: + to_display_received['3rd_party'][c_received].add((domain, )) + else: + to_display_received['not_sent'][c_received].add((domain, )) + continue + + for url_node in ct.root_hartree.cookies_sent[c_received]: + if is_3rd_party: + to_display_received['3rd_party'][c_received].add((url_node.hostname, url_node.hostnode_uuid)) + else: + to_display_received['sent'][c_received].add((url_node.hostname, url_node.hostnode_uuid)) + to_append['cookies_received'] = to_display_received + + urls.append(to_append) + return hostnode, urls + + # ##### Hostnode level methods ##### @app.route('/tree//host//hashes', methods=['GET']) @@ -283,7 +633,7 @@ def urls_hostnode(tree_uuid: str, node_uuid: str) -> Response: @app.route('/tree//host/', methods=['GET']) def hostnode_popup(tree_uuid: str, node_uuid: str) -> str | WerkzeugResponse | Response: try: - hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid) + hostnode, urls = get_hostnode_investigator(tree_uuid, node_uuid) except IndexError: return render_template('error.html', error_message='Sorry, this one is on us. The tree was rebuild, please reload the tree and try again.') @@ -850,8 +1200,8 @@ def tree_favicons(tree_uuid: str) -> str: continue mimetype = from_string(favicon, mime=True) favicon_sha512 = hashlib.sha512(favicon).hexdigest() - frequency = lookyloo.indexing.favicon_frequency(favicon_sha512) - number_captures = lookyloo.indexing.favicon_number_captures(favicon_sha512) + frequency = get_indexing(flask_login.current_user).favicon_frequency(favicon_sha512) + number_captures = get_indexing(flask_login.current_user).favicon_number_captures(favicon_sha512) b64_favicon = base64.b64encode(favicon).decode() mmh3_shodan = lookyloo.compute_mmh3_shodan(favicon) favicons.append((favicon_sha512, frequency, number_captures, mimetype, b64_favicon, mmh3_shodan)) @@ -860,7 +1210,7 @@ def tree_favicons(tree_uuid: str) -> str: @app.route('/tree//body_hashes', methods=['GET']) def tree_body_hashes(tree_uuid: str) -> str: - body_hashes = lookyloo.get_all_body_hashes(tree_uuid) + body_hashes = get_all_body_hashes(tree_uuid) return render_template('tree_body_hashes.html', tree_uuid=tree_uuid, body_hashes=body_hashes) @@ -958,27 +1308,27 @@ def index_hidden() -> str: @app.route('/cookies', methods=['GET']) def cookies_lookup() -> str: - cookies_names = [(name, freq, lookyloo.indexing.cookies_names_number_domains(name)) - for name, freq in lookyloo.indexing.cookies_names] + cookies_names = [(name, freq, get_indexing(flask_login.current_user).cookies_names_number_domains(name)) + for name, freq in get_indexing(flask_login.current_user).cookies_names] return render_template('cookies.html', cookies_names=cookies_names) @app.route('/hhhashes', methods=['GET']) def hhhashes_lookup() -> str: - hhhashes = [(hhh, freq, 
lookyloo.indexing.http_headers_hashes_number_captures(hhh)) - for hhh, freq in lookyloo.indexing.http_headers_hashes] + hhhashes = [(hhh, freq, get_indexing(flask_login.current_user).http_headers_hashes_number_captures(hhh)) + for hhh, freq in get_indexing(flask_login.current_user).http_headers_hashes] return render_template('hhhashes.html', hhhashes=hhhashes) @app.route('/favicons', methods=['GET']) def favicons_lookup() -> str: favicons = [] - for sha512, freq in lookyloo.indexing.favicons: - favicon = lookyloo.indexing.get_favicon(sha512) + for sha512, freq in get_indexing(flask_login.current_user).favicons: + favicon = get_indexing(flask_login.current_user).get_favicon(sha512) if not favicon: continue favicon_b64 = base64.b64encode(favicon).decode() - nb_captures = lookyloo.indexing.favicon_number_captures(sha512) + nb_captures = get_indexing(flask_login.current_user).favicon_number_captures(sha512) favicons.append((sha512, freq, nb_captures, favicon_b64)) return render_template('favicons.html', favicons=favicons) @@ -986,10 +1336,10 @@ def favicons_lookup() -> str: @app.route('/ressources', methods=['GET']) def ressources() -> str: ressources = [] - for h, freq in lookyloo.indexing.ressources: - domain_freq = lookyloo.indexing.ressources_number_domains(h) + for h, freq in get_indexing(flask_login.current_user).ressources: + domain_freq = get_indexing(flask_login.current_user).ressources_number_domains(h) context = lookyloo.context.find_known_content(h) - capture_uuid, url_uuid, hostnode_uuid = lookyloo.indexing.get_hash_uuids(h) + capture_uuid, url_uuid, hostnode_uuid = get_indexing(flask_login.current_user).get_hash_uuids(h) try: ressource = lookyloo.get_ressource(capture_uuid, url_uuid, h) except MissingUUID: @@ -1003,7 +1353,7 @@ def ressources() -> str: @app.route('/categories', methods=['GET']) def categories() -> str: - return render_template('categories.html', categories=lookyloo.indexing.categories) + return render_template('categories.html', categories=get_indexing(flask_login.current_user).categories) @app.route('/rebuild_all') @@ -1057,7 +1407,7 @@ def recapture(tree_uuid: str) -> str | Response | WerkzeugResponse: @app.route('/ressource_by_hash/', methods=['GET']) @file_response # type: ignore[misc] def ressource_by_hash(sha512: str) -> Response: - details, body = lookyloo.get_body_hash_full(sha512) + details, body = get_body_hash_full(sha512) return send_file(body, as_attachment=True, download_name='ressource.bin') @@ -1245,13 +1595,13 @@ def capture_web() -> str | Response | WerkzeugResponse: @app.route('/cookies/', methods=['GET']) def cookies_name_detail(cookie_name: str) -> str: - captures, domains = lookyloo.get_cookie_name_investigator(cookie_name.strip()) + captures, domains = get_cookie_name_investigator(cookie_name.strip()) return render_template('cookie_name.html', cookie_name=cookie_name, domains=domains, captures=captures) @app.route('/hhhdetails/', methods=['GET']) def hhh_detail(hhh: str) -> str: - captures, headers = lookyloo.get_hhh_investigator(hhh.strip()) + captures, headers = get_hhh_investigator(hhh.strip()) return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers) @@ -1259,7 +1609,7 @@ def hhh_detail(hhh: str) -> str: @app.route('/favicon_details//', methods=['GET']) def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str: _get_prob = bool(get_probabilistic) - captures, favicon, probabilistic_favicons = lookyloo.get_favicon_investigator(favicon_sha512.strip(), get_probabilistic=_get_prob) + captures, 
favicon, probabilistic_favicons = get_favicon_investigator(favicon_sha512.strip(), get_probabilistic=_get_prob) mimetype, b64_favicon, mmh3_shodan = favicon return render_template('favicon_details.html', favicon_sha512=favicon_sha512, captures=captures, mimetype=mimetype, b64_favicon=b64_favicon, mmh3_shodan=mmh3_shodan, @@ -1269,20 +1619,20 @@ def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str: @app.route('/body_hashes/', methods=['GET']) def body_hash_details(body_hash: str) -> str: from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False - captures, domains = lookyloo.get_body_hash_investigator(body_hash.strip()) + captures, domains = _get_body_hash_investigator(body_hash.strip()) return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures, from_popup=from_popup) @app.route('/urls/', methods=['GET']) def url_details(url: str) -> str: url = unquote_plus(url).strip() - hits = lookyloo.get_url_occurrences(url, limit=50) + hits = get_url_occurrences(url, limit=50) return render_template('url.html', url=url, hits=hits) @app.route('/hostnames/', methods=['GET']) def hostname_details(hostname: str) -> str: - hits = lookyloo.get_hostname_occurrences(hostname.strip(), with_urls_occurrences=True, limit=50) + hits = get_hostname_occurrences(hostname.strip(), with_urls_occurrences=True, limit=50) return render_template('hostname.html', hostname=hostname, hits=hits) diff --git a/website/web/genericapi.py b/website/web/genericapi.py index bc29479..e7a5deb 100644 --- a/website/web/genericapi.py +++ b/website/web/genericapi.py @@ -287,7 +287,9 @@ class TriggerModules(Resource): # type: ignore[misc] params={'h': 'The hash (sha512)'}) class HashInfo(Resource): # type: ignore[misc] def get(self, h: str) -> dict[str, Any] | tuple[dict[str, Any], int]: - details, body = lookyloo.get_body_hash_full(h) + from . import get_body_hash_full + + details, body = get_body_hash_full(h) if not details: return {'error': 'Unknown Hash.'}, 400 to_return: dict[str, Any] = {'response': {'hash': h, 'details': details, @@ -308,8 +310,9 @@ class URLInfo(Resource): # type: ignore[misc] @api.doc(body=url_info_fields) # type: ignore[misc] def post(self) -> list[dict[str, Any]]: + from . import get_url_occurrences to_query: dict[str, Any] = request.get_json(force=True) - occurrences = lookyloo.get_url_occurrences(to_query.pop('url'), **to_query) + occurrences = get_url_occurrences(to_query.pop('url'), **to_query) return occurrences @@ -326,8 +329,9 @@ class HostnameInfo(Resource): # type: ignore[misc] @api.doc(body=hostname_info_fields) # type: ignore[misc] def post(self) -> list[dict[str, Any]]: + from . import get_hostname_occurrences to_query: dict[str, Any] = request.get_json(force=True) - return lookyloo.get_hostname_occurrences(to_query.pop('hostname'), **to_query) + return get_hostname_occurrences(to_query.pop('hostname'), **to_query) @api.route('/json/stats')
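Reviewer note on the web changes as a whole: every index lookup in `website/web/__init__.py` and `website/web/genericapi.py` now goes through `get_indexing(flask_login.current_user)`, so authenticated users transparently hit the full kvrocks index when `index_everything` is enabled, while anonymous visitors keep using the regular index. The sketch below reproduces just that selection logic with stand-in types; `FakeIndexing`, `FakeUser`, and the `INDEX_EVERYTHING` constant are placeholders for the real `Indexing` class, the flask-login user, and `get_config('generic', 'index_everything')`.

```python
from __future__ import annotations

import functools

INDEX_EVERYTHING = True  # assumed value of the 'index_everything' config flag


class FakeIndexing:
    """Stand-in for lookyloo.Indexing(full_index=...)."""
    def __init__(self, full_index: bool = False) -> None:
        self.full_index = full_index


class FakeUser:
    """Stand-in for the flask-login user object."""
    def __init__(self, is_authenticated: bool) -> None:
        self.is_authenticated = is_authenticated


@functools.cache
def get_indexing(user: FakeUser | None) -> FakeIndexing:
    # Same decision tree as the new get_indexing() helper in the patch: the full
    # index is only handed out to authenticated users, and only when the
    # index_everything feature is enabled; everyone else gets the regular index.
    if not INDEX_EVERYTHING:
        return FakeIndexing()
    if not user or not user.is_authenticated:
        return FakeIndexing()
    return FakeIndexing(full_index=True)


if __name__ == '__main__':
    anon = FakeUser(is_authenticated=False)
    admin = FakeUser(is_authenticated=True)
    print(get_indexing(anon).full_index)   # False -> regular index
    print(get_indexing(admin).full_index)  # True  -> full kvrocks index
```

In this sketch, `functools.cache` memoises per user argument, so repeated calls with the same user reuse a single index handle rather than rebuilding connection pools on every request.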