mirror of https://github.com/CIRCL/lookyloo
new: Indexer for *all* the captures
parent 2bbd35c0b8
commit e45b7c4346

@@ -5,15 +5,10 @@ from __future__ import annotations
import logging
import logging.config
import os
import shutil

from datetime import datetime, timedelta
from pathlib import Path

from lookyloo import Lookyloo
from lookyloo import Lookyloo, Indexing
from lookyloo.default import AbstractManager, get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list
from lookyloo.exceptions import NoValidHarFile


logging.config.dictConfig(get_config('logging'))
@@ -21,125 +16,39 @@ logging.config.dictConfig(get_config('logging'))

class BackgroundIndexer(AbstractManager):

    def __init__(self, loglevel: int | None=None):
    def __init__(self, full: bool=False, loglevel: int | None=None):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        self.full_indexer = full
        self.indexing = Indexing(full_index=self.full_indexer)
        if self.full_indexer:
            self.script_name = 'background_full_indexer'
        else:
            self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
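
    # Illustrative usage (a sketch based on this diff; assumes the 'index_everything'
    # setup described further down):
    #   BackgroundIndexer()           -> runs as 'background_indexer' and, on a public
    #                                    instance, skips captures flagged no_index
    #   BackgroundIndexer(full=True)  -> runs as 'background_full_indexer' and indexes
    #                                    every capture into the dedicated kvrocks backend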

    def _to_run_forever(self) -> None:
        all_done = self._build_missing_pickles()
        if all_done:
            self._check_indexes()
            # Disable probabilistic indexing for now, mmh3 isn't a fuzzy hash algo.
            # self._check_probabilistic_indexes()
        self._check_indexes()
        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)

    def _build_missing_pickles(self) -> bool:
        self.logger.debug('Build missing pickles...')
        # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
        # This value makes sure we break out of the loop and build pickles of the most recent captures
        max_captures = 50
        got_new_captures = False

        # Initialize the time before which we do not want to build the pickles anymore.
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = (datetime.now() - archive_interval)
        for month_dir in make_dirs_list(self.lookyloo.capture_dir):
            __counter_shutdown = 0
            for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
                __counter_shutdown += 1
                if __counter_shutdown % 10 and self.shutdown_requested():
                    self.logger.warning('Shutdown requested, breaking.')
                    return False
                if ((path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists()):
                    # We already have a pickle file
                    self.logger.debug(f'{path} has a pickle.')
                    continue
                if not list(path.rglob('*.har.gz')) and not list(path.rglob('*.har')):
                    # No HAR file
                    self.logger.debug(f'{path} has no HAR file.')
                    continue

                if is_locked(path):
                    # it is really locked
                    self.logger.debug(f'{path} is locked, pickle generated by another process.')
                    continue

                with (path / 'uuid').open() as f:
                    uuid = f.read()

                if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                    # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                    self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
                else:
                    cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))  # type: ignore[arg-type]
                    if cached_path != path:
                        # we have a duplicate UUID, it is probably related to some bad copy/paste
                        if cached_path.exists():
                            # Both paths exist, move the one that isn't in lookup_dirs
                            self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {path}, discarding the latest')
                            try:
                                shutil.move(str(path), str(self.discarded_captures_dir / path.name))
                            except FileNotFoundError as e:
                                self.logger.warning(f'Unable to move capture: {e}')
                            continue
                        else:
                            # The path in lookup_dirs for that UUID doesn't exist, just update it.
                            self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))

                try:
                    self.logger.info(f'Build pickle for {uuid}: {path.name}')
                    self.lookyloo.get_crawled_tree(uuid)
                    self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                    self.logger.info(f'Pickle for {uuid} built.')
                    got_new_captures = True
                    max_captures -= 1
                except MissingUUID:
                    self.logger.warning(f'Unable to find {uuid}. That should not happen.')
                except NoValidHarFile as e:
                    self.logger.critical(f'There are no HAR files in the capture {uuid}: {path.name} - {e}')
                except FileNotFoundError:
                    self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
                except Exception:
                    self.logger.exception(f'Unable to build pickle for {uuid}: {path.name}')
                    # The capture is not working, moving it away.
                    try:
                        shutil.move(str(path), str(self.discarded_captures_dir / path.name))
                        self.lookyloo.redis.hdel('lookup_dirs', uuid)
                    except FileNotFoundError as e:
                        self.logger.warning(f'Unable to move capture: {e}')
                    continue
                if max_captures <= 0:
                    self.logger.info('Too many captures in the backlog, start from the beginning.')
                    return False
        if got_new_captures:
            self.logger.info('Finished building all missing pickles.')
            # Only return True if we built new pickles.
            return True
        return False
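
    # Illustration of the cap above: with max_captures = 50, a pass that builds 50 new
    # pickles returns False early, so the next call starts over and keeps prioritising
    # the most recent captures instead of staying stuck deep in the backlog.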

    def _check_indexes(self) -> None:
        index_redis = self.lookyloo.indexing.redis
        can_index = index_redis.set('ongoing_indexing', 1, ex=3600, nx=True)
        if not can_index:
        if not self.indexing.can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Indexing already ongoing in another process.')
            return None
        self.logger.info('Check indexes...')
        self.logger.info(f'Check {self.script_name}...')
        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture unindexed
            if not self.full_indexer:
                # If we're not running the full indexer, check if the capture should be indexed.
                if self.lookyloo.is_public_instance and cache.no_index:
                    # Capture unindexed
                    continue
            if not cache.tree_ready:
                # pickle isn't ready, we can't index.
                continue
            p = index_redis.pipeline()
            p.sismember('indexed_urls', cache.uuid)
            p.sismember('indexed_body_hashes', cache.uuid)
            p.sismember('indexed_cookies', cache.uuid)
            p.sismember('indexed_hhhashes', cache.uuid)
            p.sismember('indexed_favicons', cache.uuid)
            indexed = p.execute()
            indexed = self.indexing.capture_indexed(cache.uuid)
            if all(indexed):
                continue
            try:

@@ -151,50 +60,23 @@ class BackgroundIndexer(AbstractManager):

                if not indexed[0]:
                    self.logger.info(f'Indexing urls for {cache.uuid}')
                    self.lookyloo.indexing.index_url_capture(ct)
                    self.indexing.index_url_capture(ct)
                if not indexed[1]:
                    self.logger.info(f'Indexing resources for {cache.uuid}')
                    self.lookyloo.indexing.index_body_hashes_capture(ct)
                    self.indexing.index_body_hashes_capture(ct)
                if not indexed[2]:
                    self.logger.info(f'Indexing cookies for {cache.uuid}')
                    self.lookyloo.indexing.index_cookies_capture(ct)
                    self.indexing.index_cookies_capture(ct)
                if not indexed[3]:
                    self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
                    self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
                    self.indexing.index_http_headers_hashes_capture(ct)
                if not indexed[4]:
                    self.logger.info(f'Indexing favicons for {cache.uuid}')
                    favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
                    self.lookyloo.indexing.index_favicons_capture(cache.uuid, favicons)
                    self.indexing.index_favicons_capture(cache.uuid, favicons)
            # NOTE: categories aren't taken into account here, should be fixed(?)
            # see indexing.index_categories_capture(capture_uuid, categories)
        index_redis.delete('ongoing_indexing')
        self.logger.info('... done.')

    def _check_probabilistic_indexes(self) -> None:
        index_redis = self.lookyloo.indexing.redis
        can_index = index_redis.set('ongoing_probalistic_indexing', 1, ex=3600, nx=True)
        if not can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Probabilistic indexing already ongoing in another process.')
            return None
        self.logger.info('Check probabilistic indexes...')
        algorithms = ['mmh3-shodan']
        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture unindexed
                continue
            p = index_redis.pipeline()
            for algorithm in algorithms:
                p.sismember(f'indexed_favicons_probabilistic|{algorithm}', cache.uuid)
            indexed = p.execute()
            if all(indexed):
                continue
            for i, algorithm in enumerate(algorithms):
                if not indexed[i]:
                    self.logger.info(f'Probabilistic indexing favicons for {cache.uuid} with {algorithm}')
                    favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
                    self.lookyloo.indexing.index_favicons_probabilistic(cache.uuid, favicons, algorithm)
        index_redis.delete('ongoing_probalistic_indexing')
        self.indexing.indexing_done()
        self.logger.info('... done.')
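
    # A minimal sketch of how can_index / indexing_done could be backed by the same
    # NX lock the older code sets by hand (an assumption, not necessarily the actual
    # Indexing implementation):
    #
    #   @property
    #   def can_index(self) -> bool:
    #       # Only the process that manages to set the key gets to index.
    #       return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))
    #
    #   def indexing_done(self) -> None:
    #       self.redis.delete('ongoing_indexing')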

@@ -203,5 +85,12 @@ def main() -> None:
    i.run(sleep_in_sec=60)


def main_full_indexer() -> None:
    if not get_config('generic', 'index_everything'):
        raise Exception('Full indexer is disabled.')
    i = BackgroundIndexer(full=True)
    i.run(sleep_in_sec=60)


if __name__ == '__main__':
    main()

@@ -11,7 +11,7 @@ from subprocess import Popen
from redis import Redis
from redis.exceptions import ConnectionError

from lookyloo.default import get_homedir, get_socket_path
from lookyloo.default import get_homedir, get_socket_path, get_config


def check_running(name: str) -> bool:
@@ -55,13 +55,32 @@ def shutdown_indexing(storage_directory: Path | None=None) -> None:
    print('Redis indexing database shutdown.')


def launch_full_index(storage_directory: Path | None=None) -> None:
    if not storage_directory:
        storage_directory = get_homedir()
    if not check_running('full_index'):
        Popen(["./run_kvrocks.sh"], cwd=(storage_directory / 'full_index'))


def shutdown_full_index(storage_directory: Path | None=None) -> None:
    if not storage_directory:
        storage_directory = get_homedir()
    r = Redis(unix_socket_path=get_socket_path('full_index'))
    r.shutdown(save=True)
    print('Kvrocks full indexing database shutdown.')
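
# Illustrative manual usage (a sketch, assuming these helpers run from the Lookyloo homedir):
#   launch_full_index()    # starts kvrocks through full_index/run_kvrocks.sh if not running
#   shutdown_full_index()  # issues SHUTDOWN SAVE over the full_index unix socket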


def launch_all() -> None:
    launch_cache()
    launch_indexing()
    if get_config('generic', 'index_everything'):
        launch_full_index()


def check_all(stop: bool=False) -> None:
    backends: dict[str, bool] = {'cache': False, 'indexing': False}
    if get_config('generic', 'index_everything'):
        backends['full_index'] = False
    while True:
        for db_name in backends.keys():
            try:

@@ -85,6 +104,8 @@ def check_all(stop: bool=False) -> None:
def stop_all() -> None:
    shutdown_cache()
    shutdown_indexing()
    if get_config('generic', 'index_everything'):
        shutdown_full_index()


def main() -> None:

@@ -2,7 +2,7 @@

from subprocess import Popen, run

from lookyloo.default import get_homedir
from lookyloo.default import get_homedir, get_config


def main() -> None:

@@ -18,9 +18,16 @@ def main() -> None:
    print('Start asynchronous ingestor...')
    Popen(['async_capture'])
    print('done.')
    print('Start background capture builder...')
    Popen(['background_build_captures'])
    print('done.')
    print('Start background indexer...')
    Popen(['background_indexer'])
    print('done.')
    if get_config('generic', 'index_everything'):
        print('Start background full indexer...')
        Popen(['background_full_indexer'])
        print('done.')
    print('Start background processing...')
    Popen(['processing'])
    print('done.')

@@ -79,6 +79,7 @@
            "bucket_name": ""
        }
    },
    "index_everything": false,
    "_notes": {
        "loglevel": "(lookyloo) Can be one of the values listed here: https://docs.python.org/3/library/logging.html#levels",
        "only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network",

@@ -110,6 +111,7 @@
        "archive": "The captures older than this value (in days) will be archived. They're not cached by default in the Lookyloo class.",
        "max_capture_time": "The maximum time we allow a capture to keep going. Should only be triggered by captures that cause playwright to never quit.",
        "max_tree_create_time": "The max time the generation of a tree is allowed to take",
        "s3fs": "The config to access a S3FS instance with the s3fs python module - it is not integrated properly for now as it requires urllib < 2.0 which is a non-starter at this stage."
        "s3fs": "The config to access a S3FS instance with the s3fs python module - it is not integrated properly for now as it requires urllib < 2.0 which is a non-starter at this stage.",
        "index_everything": "If true, index every capture, even if it's not public. This feature requires a dedicated kvrocks instance, and is only accessible when logged-in as admin."
    }
}
@@ -0,0 +1,875 @@
|
|||
################################ GENERAL #####################################
|
||||
|
||||
# By default kvrocks listens for connections from localhost interface.
|
||||
# It is possible to listen to just one or multiple interfaces using
|
||||
# the "bind" configuration directive, followed by one or more IP addresses.
|
||||
#
|
||||
# Examples:
|
||||
#
|
||||
# bind 192.168.1.100 10.0.0.1
|
||||
# bind 127.0.0.1 ::1
|
||||
# bind 0.0.0.0
|
||||
# bind 127.0.0.1
|
||||
|
||||
# Unix socket.
|
||||
#
|
||||
# Specify the path for the unix socket that will be used to listen for
|
||||
# incoming connections. There is no default, so kvrocks will not listen
|
||||
# on a unix socket when not specified.
|
||||
#
|
||||
unixsocket full_index.sock
|
||||
unixsocketperm 777
|
||||
|
||||
# Accept connections on the specified port, default is 6666.
|
||||
# port 6666
|
||||
|
||||
# Close the connection after a client is idle for N seconds (0 to disable)
|
||||
timeout 0
|
||||
|
||||
# The number of worker's threads, increase or decrease would affect the performance.
|
||||
workers 8
|
||||
|
||||
# By default, kvrocks does not run as a daemon. Use 'yes' if you need it.
|
||||
# Note that kvrocks will write a PID file in /var/run/kvrocks.pid when daemonized
|
||||
daemonize yes
|
||||
|
||||
# Kvrocks implements the cluster solution that is similar to the Redis cluster solution.
|
||||
# You can get cluster information by CLUSTER NODES|SLOTS|INFO command, it also is
|
||||
# adapted to redis-cli, redis-benchmark, Redis cluster SDK, and Redis cluster proxy.
|
||||
# But kvrocks doesn't support communicating with each other, so you must set
|
||||
# cluster topology by CLUSTER SETNODES|SETNODEID commands, more details: #219.
|
||||
#
|
||||
# PLEASE NOTE:
|
||||
# If you enable cluster, kvrocks will encode key with its slot id calculated by
|
||||
# CRC16 and modulo 16384, encoding key with its slot id makes it efficient to
|
||||
# migrate keys based on the slot. So if you enabled at first time, cluster mode must
|
||||
# not be disabled after restarting, and vice versa. That is to say, data is not
|
||||
# compatible between standalone mode with cluster mode, you must migrate data
|
||||
# if you want to change mode, otherwise, kvrocks will make data corrupt.
|
||||
#
|
||||
# Default: no
|
||||
|
||||
cluster-enabled no
|
||||
|
||||
# By default, namespaces are stored in the configuration file and won't be replicated
|
||||
# to replicas. This option allows to change this behavior, so that namespaces are also
|
||||
# propagated to slaves. Note that:
|
||||
# 1) it won't replicate the 'masterauth' to prevent breaking master/replica replication
|
||||
# 2) it will overwrite replica's namespace with master's namespace, so be careful of in-using namespaces
|
||||
# 3) cannot switch off the namespace replication once it's enabled
|
||||
#
|
||||
# Default: no
|
||||
repl-namespace-enabled no
|
||||
|
||||
# Persist the cluster nodes topology in local file($dir/nodes.conf). This configuration
|
||||
# takes effect only if the cluster mode was enabled.
|
||||
#
|
||||
# If yes, it will try to load the cluster topology from the local file when starting,
|
||||
# and dump the cluster nodes into the file if it was changed.
|
||||
#
|
||||
# Default: yes
|
||||
persist-cluster-nodes-enabled yes
|
||||
|
||||
# Set the max number of connected clients at the same time. By default
|
||||
# this limit is set to 10000 clients. However, if the server is not
|
||||
# able to configure the process file limit to allow for the specified limit
|
||||
# the max number of allowed clients is set to the current file limit
|
||||
#
|
||||
# Once the limit is reached the server will close all the new connections sending
|
||||
# an error 'max number of clients reached'.
|
||||
#
|
||||
maxclients 10000
|
||||
|
||||
# Require clients to issue AUTH <PASSWORD> before processing any other
|
||||
# commands. This might be useful in environments in which you do not trust
|
||||
# others with access to the host running kvrocks.
|
||||
#
|
||||
# This should stay commented out for backward compatibility and because most
|
||||
# people do not need auth (e.g. they run their own servers).
|
||||
#
|
||||
# Warning: since kvrocks is pretty fast an outside user can try up to
|
||||
# 150k passwords per second against a good box. This means that you should
|
||||
# use a very strong password otherwise it will be very easy to break.
|
||||
#
|
||||
# requirepass foobared
|
||||
|
||||
# If the master is password protected (using the "masterauth" configuration
|
||||
# directive below) it is possible to tell the slave to authenticate before
|
||||
# starting the replication synchronization process. Otherwise, the master will
|
||||
# refuse the slave request.
|
||||
#
|
||||
# masterauth foobared
|
||||
|
||||
# Master-Slave replication would check that the db name matches. If not, the slave should
|
||||
# refuse to sync the db from master. Don't use the default value, set the db-name to identify
|
||||
# the cluster.
|
||||
db-name change.me.db
|
||||
|
||||
# The working directory
|
||||
#
|
||||
# The DB will be written inside this directory
|
||||
# Note that you must specify a directory here, not a file name.
|
||||
dir ./
|
||||
|
||||
# You can configure where to store your server logs by the log-dir.
|
||||
# If you don't specify one, we will use the above `dir` as our default log directory.
|
||||
# Sending logs to stdout/stderr is as simple as:
|
||||
#
|
||||
log-dir stdout
|
||||
|
||||
# Log level
|
||||
# Possible values: info, warning, error, fatal
|
||||
# Default: info
|
||||
log-level info
|
||||
|
||||
# You can configure log-retention-days to control whether to enable the log cleaner
|
||||
# and the maximum retention days that the INFO level logs will be kept.
|
||||
#
|
||||
# if set to -1, that means to disable the log cleaner.
|
||||
# if set to 0, all previous INFO level logs will be immediately removed.
|
||||
# if set to between 0 and INT_MAX, it will retain the latest N (log-retention-days) days of logs.
|
||||
|
||||
# By default the log-retention-days is -1.
|
||||
log-retention-days -1
|
||||
|
||||
# When running in daemonize mode, kvrocks writes a PID file in ${CONFIG_DIR}/kvrocks.pid by
|
||||
# default. You can specify a custom pid file location here.
|
||||
pidfile kvrocks.pid
|
||||
|
||||
# You can configure a slave instance to accept writes or not. Writing against
|
||||
# a slave instance may be useful to store some ephemeral data (because data
|
||||
# written on a slave will be easily deleted after resync with the master) but
|
||||
# may also cause problems if clients are writing to it because of a
|
||||
# misconfiguration.
|
||||
slave-read-only yes
|
||||
|
||||
# The slave priority is an integer number published by Kvrocks in the INFO output.
|
||||
# It is used by Redis Sentinel in order to select a slave to promote into a
|
||||
# master if the master is no longer working correctly.
|
||||
#
|
||||
# A slave with a low priority number is considered better for promotion, so
|
||||
# for instance if there are three slave with priority 10, 100, 25 Sentinel will
|
||||
# pick the one with priority 10, that is the lowest.
|
||||
#
|
||||
# However a special priority of 0 marks the replica as not able to perform the
|
||||
# role of master, so a slave with priority of 0 will never be selected by
|
||||
# Redis Sentinel for promotion.
|
||||
#
|
||||
# By default the priority is 100.
|
||||
slave-priority 100
|
||||
|
||||
# TCP listen() backlog.
|
||||
#
|
||||
# In high requests-per-second environments you need a high backlog in order
|
||||
# to avoid slow clients connections issues. Note that the Linux kernel
|
||||
# will silently truncate it to the value of /proc/sys/net/core/somaxconn so
|
||||
# make sure to raise both the value of somaxconn and tcp_max_syn_backlog
|
||||
# in order to get the desired effect.
|
||||
tcp-backlog 511
|
||||
|
||||
# If the master is an old version, it may have specified replication threads
|
||||
# that use 'port + 1' as listening port, but in new versions, we don't use
|
||||
# extra port to implement replication. In order to allow the new replicas to
|
||||
# copy old masters, you should indicate that the master uses replication port
|
||||
# or not.
|
||||
# If yes, that indicates master uses replication port and replicas will connect
|
||||
# to 'master's listening port + 1' when synchronization.
|
||||
# If no, that indicates master doesn't use replication port and replicas will
|
||||
# connect 'master's listening port' when synchronization.
|
||||
master-use-repl-port no
|
||||
|
||||
# Currently, master only checks sequence number when replica asks for PSYNC,
|
||||
# that is not enough since they may have different replication histories even
|
||||
# the replica asking sequence is in the range of the master current WAL.
|
||||
#
|
||||
# We design 'Replication Sequence ID' PSYNC, we add unique replication id for
|
||||
# every write batch (the operation of each command on the storage engine), so
|
||||
# the combination of replication id and sequence is unique for write batch.
|
||||
# The master can identify whether the replica has the same replication history
|
||||
# by checking replication id and sequence.
|
||||
#
|
||||
# By default, it is not enabled since this stricter check may easily lead to
|
||||
# full synchronization.
|
||||
use-rsid-psync no
|
||||
|
||||
# Master-Slave replication. Use slaveof to make a kvrocks instance a copy of
|
||||
# another kvrocks server. A few things to understand ASAP about kvrocks replication.
|
||||
#
|
||||
# 1) Kvrocks replication is asynchronous, but you can configure a master to
|
||||
# stop accepting writes if it appears to be not connected with at least
|
||||
# a given number of slaves.
|
||||
# 2) Kvrocks slaves are able to perform a partial resynchronization with the
|
||||
# master if the replication link is lost for a relatively small amount of
|
||||
# time. You may want to configure the replication backlog size (see the next
|
||||
# sections of this file) with a sensible value depending on your needs.
|
||||
# 3) Replication is automatic and does not need user intervention. After a
|
||||
# network partition slaves automatically try to reconnect to masters
|
||||
# and resynchronize with them.
|
||||
#
|
||||
# slaveof <masterip> <masterport>
|
||||
# slaveof 127.0.0.1 6379
|
||||
|
||||
# When a slave loses its connection with the master, or when the replication
|
||||
# is still in progress, the slave can act in two different ways:
|
||||
#
|
||||
# 1) if slave-serve-stale-data is set to 'yes' (the default) the slave will
|
||||
# still reply to client requests, possibly with out-of-date data, or the
|
||||
# data set may just be empty if this is the first synchronization.
|
||||
#
|
||||
# 2) if slave-serve-stale-data is set to 'no' the slave will reply with
|
||||
# an error "SYNC with master in progress" to all kinds of commands
|
||||
# but to INFO and SLAVEOF.
|
||||
#
|
||||
slave-serve-stale-data yes
|
||||
|
||||
# To guarantee slave's data safe and serve when it is in full synchronization
|
||||
# state, slave still keep itself data. But this way needs to occupy much disk
|
||||
# space, so we provide a way to reduce disk occupation, slave will delete itself
|
||||
# entire database before fetching files from master during full synchronization.
|
||||
# If you want to enable this way, you can set 'slave-delete-db-before-fullsync'
|
||||
# to yes, but you must know that database will be lost if master is down during
|
||||
# full synchronization, unless you have a backup of database.
|
||||
#
|
||||
# This option is similar redis replicas RDB diskless load option:
|
||||
# repl-diskless-load on-empty-db
|
||||
#
|
||||
# Default: no
|
||||
slave-empty-db-before-fullsync no
|
||||
|
||||
# A Kvrocks master is able to list the address and port of the attached
|
||||
# replicas in different ways. For example the "INFO replication" section
|
||||
# offers this information, which is used, among other tools, by
|
||||
# Redis Sentinel in order to discover replica instances.
|
||||
# Another place where this info is available is in the output of the
|
||||
# "ROLE" command of a master.
|
||||
#
|
||||
# The listed IP address and port normally reported by a replica is
|
||||
# obtained in the following way:
|
||||
#
|
||||
# IP: The address is auto detected by checking the peer address
|
||||
# of the socket used by the replica to connect with the master.
|
||||
#
|
||||
# Port: The port is communicated by the replica during the replication
|
||||
# handshake, and is normally the port that the replica is using to
|
||||
# listen for connections.
|
||||
#
|
||||
# However when port forwarding or Network Address Translation (NAT) is
|
||||
# used, the replica may actually be reachable via different IP and port
|
||||
# pairs. The following two options can be used by a replica in order to
|
||||
# report to its master a specific set of IP and port, so that both INFO
|
||||
# and ROLE will report those values.
|
||||
#
|
||||
# There is no need to use both the options if you need to override just
|
||||
# the port or the IP address.
|
||||
#
|
||||
# replica-announce-ip 5.5.5.5
|
||||
# replica-announce-port 1234
|
||||
|
||||
# If replicas need full synchronization with master, master need to create
|
||||
# checkpoint for feeding replicas, and replicas also stage a checkpoint of
|
||||
# the master. If we also keep the backup, it maybe occupy extra disk space.
|
||||
# You can enable 'purge-backup-on-fullsync' if disk is not sufficient, but
|
||||
# that may cause remote backup copy failing.
|
||||
#
|
||||
# Default: no
|
||||
purge-backup-on-fullsync no
|
||||
|
||||
# The maximum allowed rate (in MB/s) that should be used by replication.
|
||||
# If the rate exceeds max-replication-mb, replication will slow down.
|
||||
# Default: 0 (i.e. no limit)
|
||||
max-replication-mb 0
|
||||
|
||||
# The maximum allowed aggregated write rate of flush and compaction (in MB/s).
|
||||
# If the rate exceeds max-io-mb, io will slow down.
|
||||
# 0 is no limit
|
||||
# Default: 0
|
||||
max-io-mb 0
|
||||
|
||||
# The maximum allowed space (in GB) that should be used by RocksDB.
|
||||
# If the total size of the SST files exceeds max_allowed_space, writes to RocksDB will fail.
|
||||
# Please see: https://github.com/facebook/rocksdb/wiki/Managing-Disk-Space-Utilization
|
||||
# Default: 0 (i.e. no limit)
|
||||
max-db-size 0
|
||||
|
||||
# The maximum backup to keep, server cron would run every minutes to check the num of current
|
||||
# backup, and purge the old backup if exceed the max backup num to keep. If max-backup-to-keep
|
||||
# is 0, no backup would be kept. But now, we only support 0 or 1.
|
||||
max-backup-to-keep 1
|
||||
|
||||
# The maximum hours to keep the backup. If max-backup-keep-hours is 0, wouldn't purge any backup.
|
||||
# default: 1 day
|
||||
max-backup-keep-hours 24
|
||||
|
||||
# max-bitmap-to-string-mb use to limit the max size of bitmap to string transformation(MB).
|
||||
#
|
||||
# Default: 16
|
||||
max-bitmap-to-string-mb 16
|
||||
|
||||
# Whether to enable SCAN-like cursor compatible with Redis.
|
||||
# If enabled, the cursor will be unsigned 64-bit integers.
|
||||
# If disabled, the cursor will be a string.
|
||||
# Default: no
|
||||
redis-cursor-compatible no
|
||||
|
||||
# Maximum nesting depth allowed when parsing and serializing
|
||||
# JSON documents while using JSON commands like JSON.SET.
|
||||
# Default: 1024
|
||||
json-max-nesting-depth 1024
|
||||
|
||||
# The underlying storage format of JSON data type
|
||||
# NOTE: This option only affects newly written/updated key-values
|
||||
# The CBOR format may reduce the storage size and speed up JSON commands
|
||||
# Available values: json, cbor
|
||||
# Default: json
|
||||
json-storage-format json
|
||||
|
||||
################################## TLS ###################################
|
||||
|
||||
# By default, TLS/SSL is disabled, i.e. `tls-port` is set to 0.
|
||||
# To enable it, `tls-port` can be used to define TLS-listening ports.
|
||||
# tls-port 0
|
||||
|
||||
# Configure a X.509 certificate and private key to use for authenticating the
|
||||
# server to connected clients, masters or cluster peers.
|
||||
# These files should be PEM formatted.
|
||||
#
|
||||
# tls-cert-file kvrocks.crt
|
||||
# tls-key-file kvrocks.key
|
||||
|
||||
# If the key file is encrypted using a passphrase, it can be included here
|
||||
# as well.
|
||||
#
|
||||
# tls-key-file-pass secret
|
||||
|
||||
# Configure a CA certificate(s) bundle or directory to authenticate TLS/SSL
|
||||
# clients and peers. Kvrocks requires an explicit configuration of at least one
|
||||
# of these, and will not implicitly use the system wide configuration.
|
||||
#
|
||||
# tls-ca-cert-file ca.crt
|
||||
# tls-ca-cert-dir /etc/ssl/certs
|
||||
|
||||
# By default, clients on a TLS port are required
|
||||
# to authenticate using valid client side certificates.
|
||||
#
|
||||
# If "no" is specified, client certificates are not required and not accepted.
|
||||
# If "optional" is specified, client certificates are accepted and must be
|
||||
# valid if provided, but are not required.
|
||||
#
|
||||
# tls-auth-clients no
|
||||
# tls-auth-clients optional
|
||||
|
||||
# By default, only TLSv1.2 and TLSv1.3 are enabled and it is highly recommended
|
||||
# that older formally deprecated versions are kept disabled to reduce the attack surface.
|
||||
# You can explicitly specify TLS versions to support.
|
||||
# Allowed values are case insensitive and include "TLSv1", "TLSv1.1", "TLSv1.2",
|
||||
# "TLSv1.3" (OpenSSL >= 1.1.1) or any combination.
|
||||
# To enable only TLSv1.2 and TLSv1.3, use:
|
||||
#
|
||||
# tls-protocols "TLSv1.2 TLSv1.3"
|
||||
|
||||
# Configure allowed ciphers. See the ciphers(1ssl) manpage for more information
|
||||
# about the syntax of this string.
|
||||
#
|
||||
# Note: this configuration applies only to <= TLSv1.2.
|
||||
#
|
||||
# tls-ciphers DEFAULT:!MEDIUM
|
||||
|
||||
# Configure allowed TLSv1.3 ciphersuites. See the ciphers(1ssl) manpage for more
|
||||
# information about the syntax of this string, and specifically for TLSv1.3
|
||||
# ciphersuites.
|
||||
#
|
||||
# tls-ciphersuites TLS_CHACHA20_POLY1305_SHA256
|
||||
|
||||
# When choosing a cipher, use the server's preference instead of the client
|
||||
# preference. By default, the server follows the client's preference.
|
||||
#
|
||||
# tls-prefer-server-ciphers yes
|
||||
|
||||
# By default, TLS session caching is enabled to allow faster and less expensive
|
||||
# reconnections by clients that support it. Use the following directive to disable
|
||||
# caching.
|
||||
#
|
||||
# tls-session-caching no
|
||||
|
||||
# Change the default number of TLS sessions cached. A zero value sets the cache
|
||||
# to unlimited size. The default size is 20480.
|
||||
#
|
||||
# tls-session-cache-size 5000
|
||||
|
||||
# Change the default timeout of cached TLS sessions. The default timeout is 300
|
||||
# seconds.
|
||||
#
|
||||
# tls-session-cache-timeout 60
|
||||
|
||||
# By default, a replica does not attempt to establish a TLS connection
|
||||
# with its master.
|
||||
#
|
||||
# Use the following directive to enable TLS on replication links.
|
||||
#
|
||||
# tls-replication yes
|
||||
|
||||
################################## SLOW LOG ###################################
|
||||
|
||||
# The Kvrocks Slow Log is a mechanism to log queries that exceeded a specified
|
||||
# execution time. The execution time does not include the I/O operations
|
||||
# like talking with the client, sending the reply and so forth,
|
||||
# but just the time needed to actually execute the command (this is the only
|
||||
# stage of command execution where the thread is blocked and can not serve
|
||||
# other requests in the meantime).
|
||||
#
|
||||
# You can configure the slow log with two parameters: one tells Kvrocks
|
||||
# what is the execution time, in microseconds, to exceed in order for the
|
||||
# command to get logged, and the other parameter is the length of the
|
||||
# slow log. When a new command is logged the oldest one is removed from the
|
||||
# queue of logged commands.
|
||||
|
||||
# The following time is expressed in microseconds, so 1000000 is equivalent
|
||||
# to one second. Note that -1 value disables the slow log, while
|
||||
# a value of zero forces the logging of every command.
|
||||
slowlog-log-slower-than 100000
|
||||
|
||||
# There is no limit to this length. Just be aware that it will consume memory.
|
||||
# You can reclaim memory used by the slow log with SLOWLOG RESET.
|
||||
slowlog-max-len 128
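
# For example: with slowlog-log-slower-than 100000, any command whose execution takes
# longer than 100000 microseconds (0.1 s) is recorded, and with slowlog-max-len 128 only
# the 128 most recent entries are kept (the oldest is dropped when a new one is logged).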
|
||||
|
||||
# If you run kvrocks from upstart or systemd, kvrocks can interact with your
|
||||
# supervision tree. Options:
|
||||
# supervised no - no supervision interaction
|
||||
# supervised upstart - signal upstart by putting kvrocks into SIGSTOP mode
|
||||
# supervised systemd - signal systemd by writing READY=1 to $NOTIFY_SOCKET
|
||||
# supervised auto - detect upstart or systemd method based on
|
||||
# UPSTART_JOB or NOTIFY_SOCKET environment variables
|
||||
# Note: these supervision methods only signal "process is ready."
|
||||
# They do not enable continuous liveness pings back to your supervisor.
|
||||
supervised no
|
||||
|
||||
################################## PERF LOG ###################################
|
||||
|
||||
# The Kvrocks Perf Log is a mechanism to log queries' performance context that
|
||||
# exceeded a specified execution time. This mechanism uses rocksdb's
|
||||
# Perf Context and IO Stats Context, Please see:
|
||||
# https://github.com/facebook/rocksdb/wiki/Perf-Context-and-IO-Stats-Context
|
||||
#
|
||||
# This mechanism is enabled when profiling-sample-commands is not empty and
|
||||
# profiling-sample-ratio greater than 0.
|
||||
# It is important to note that this mechanism affects performance, but it is
|
||||
# useful for troubleshooting performance bottlenecks, so it should only be
|
||||
# enabled when performance problems occur.
|
||||
|
||||
# The name of the commands you want to record. Must be original name of
|
||||
# commands supported by Kvrocks. Use ',' to separate multiple commands and
|
||||
# use '*' to record all commands supported by Kvrocks.
|
||||
# Example:
|
||||
# - Single command: profiling-sample-commands get
|
||||
# - Multiple commands: profiling-sample-commands get,mget,hget
|
||||
#
|
||||
# Default: empty
|
||||
# profiling-sample-commands ""
|
||||
|
||||
# Ratio of the samples would be recorded. It is a number between 0 and 100.
|
||||
# We simply use the rand to determine whether to record the sample or not.
|
||||
#
|
||||
# Default: 0
|
||||
profiling-sample-ratio 0
|
||||
|
||||
# There is no limit to this length. Just be aware that it will consume memory.
|
||||
# You can reclaim memory used by the perf log with PERFLOG RESET.
|
||||
#
|
||||
# Default: 256
|
||||
profiling-sample-record-max-len 256
|
||||
|
||||
# profiling-sample-record-threshold-ms use to tell the kvrocks when to record.
|
||||
#
|
||||
# Default: 100 millisecond
|
||||
profiling-sample-record-threshold-ms 100
|
||||
|
||||
################################## CRON ###################################
|
||||
|
||||
# Compact Scheduler, auto compact at schedule time
|
||||
# time expression format is the same as crontab(currently only support * and int)
|
||||
# e.g. compact-cron 0 3 * * * 0 4 * * *
|
||||
# would compact the db at 3am and 4am everyday
|
||||
# compact-cron 0 3 * * *
|
||||
|
||||
# The hour range that compaction checker would be active
|
||||
# e.g. compaction-checker-range 0-7 means the compaction checker would work between
|
||||
# 0-7am every day.
|
||||
compaction-checker-range 0-7
|
||||
|
||||
# When the compaction checker is triggered, the db will periodically pick the SST file
|
||||
# with the highest "deleted percentage" (i.e. the percentage of deleted keys in the SST
|
||||
# file) to compact, in order to free disk space.
|
||||
# However, if a specific SST file was created more than "force-compact-file-age" seconds
|
||||
# ago, and its percentage of deleted keys is higher than
|
||||
# "force-compact-file-min-deleted-percentage", it will be forcely compacted as well.
|
||||
|
||||
# Default: 172800 seconds; Range: [60, INT64_MAX];
|
||||
# force-compact-file-age 172800
|
||||
# Default: 10 %; Range: [1, 100];
|
||||
# force-compact-file-min-deleted-percentage 10
|
||||
|
||||
# Bgsave scheduler, auto bgsave at scheduled time
|
||||
# time expression format is the same as crontab(currently only support * and int)
|
||||
# e.g. bgsave-cron 0 3 * * * 0 4 * * *
|
||||
# would bgsave the db at 3am and 4am every day
|
||||
|
||||
# Command renaming.
|
||||
#
|
||||
# It is possible to change the name of dangerous commands in a shared
|
||||
# environment. For instance, the KEYS command may be renamed into something
|
||||
# hard to guess so that it will still be available for internal-use tools
|
||||
# but not available for general clients.
|
||||
#
|
||||
# Example:
|
||||
#
|
||||
# rename-command KEYS b840fc02d524045429941cc15f59e41cb7be6c52
|
||||
#
|
||||
# It is also possible to completely kill a command by renaming it into
|
||||
# an empty string:
|
||||
#
|
||||
# rename-command KEYS ""
|
||||
|
||||
################################ MIGRATE #####################################
|
||||
# If the network bandwidth is completely consumed by the migration task,
|
||||
# it will affect the availability of kvrocks. To avoid this situation,
|
||||
# migrate-speed is adopted to limit the migrating speed.
|
||||
# Migrating speed is limited by controlling the duration between sending data,
|
||||
# the duration is calculated by: 1000000 * migrate-pipeline-size / migrate-speed (us).
|
||||
# Value: [0,INT_MAX], 0 means no limit
|
||||
#
|
||||
# Default: 4096
|
||||
migrate-speed 4096
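
# For example, with migrate-speed 4096 and the default migrate-pipeline-size of 16 (see
# below), the pause between two batches is 1000000 * 16 / 4096, i.e. about 3906 us (~4 ms).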
|
||||
|
||||
# In order to reduce data transmission times and improve the efficiency of data migration,
|
||||
# pipeline is adopted to send multiple data at once. Pipeline size can be set by this option.
|
||||
# Value: [1, INT_MAX], it can't be 0
|
||||
#
|
||||
# Default: 16
|
||||
migrate-pipeline-size 16
|
||||
|
||||
# In order to reduce the write forbidden time during migrating slot, we will migrate the incremental
|
||||
# data several times to reduce the amount of incremental data. Until the quantity of incremental
|
||||
# data is reduced to a certain threshold, slot will be forbidden write. The threshold is set by
|
||||
# this option.
|
||||
# Value: [1, INT_MAX], it can't be 0
|
||||
#
|
||||
# Default: 10000
|
||||
migrate-sequence-gap 10000
|
||||
|
||||
################################ ROCKSDB #####################################
|
||||
|
||||
# Specify the capacity of column family block cache. A larger block cache
|
||||
# may make requests faster while more keys would be cached. Max Size is 400*1024.
|
||||
# Default: 4096MB
|
||||
rocksdb.block_cache_size 4096
|
||||
|
||||
# A global cache for table-level rows in RocksDB. If almost always point
|
||||
# lookups, enlarging row cache may improve read performance. Otherwise,
|
||||
# if we enlarge this value, we can lessen metadata/subkey block cache size.
|
||||
#
|
||||
# Default: 0 (disabled)
|
||||
rocksdb.row_cache_size 0
|
||||
|
||||
# Number of open files that can be used by the DB. You may need to
|
||||
# increase this if your database has a large working set. Value -1 means
|
||||
# files opened are always kept open. You can estimate number of files based
|
||||
# on target_file_size_base and target_file_size_multiplier for level-based
|
||||
# compaction. For universal-style compaction, you can usually set it to -1.
|
||||
# Default: 8096
|
||||
rocksdb.max_open_files 8096
|
||||
|
||||
# Amount of data to build up in memory (backed by an unsorted log
|
||||
# on disk) before converting to a sorted on-disk file.
|
||||
#
|
||||
# Larger values increase performance, especially during bulk loads.
|
||||
# Up to max_write_buffer_number write buffers may be held in memory
|
||||
# at the same time,
|
||||
# so you may wish to adjust this parameter to control memory usage.
|
||||
# Also, a larger write buffer will result in a longer recovery time
|
||||
# the next time the database is opened.
|
||||
#
|
||||
# Note that write_buffer_size is enforced per column family.
|
||||
# See db_write_buffer_size for sharing memory across column families.
|
||||
|
||||
# default is 64MB
|
||||
rocksdb.write_buffer_size 64
|
||||
|
||||
# Target file size for compaction, target file size for Level N can be calculated
|
||||
# by target_file_size_base * (target_file_size_multiplier ^ (L-1))
|
||||
#
|
||||
# Default: 128MB
|
||||
rocksdb.target_file_size_base 128
|
||||
|
||||
# The maximum number of write buffers that are built up in memory.
|
||||
# The default and the minimum number is 2, so that when 1 write buffer
|
||||
# is being flushed to storage, new writes can continue to the other
|
||||
# write buffer.
|
||||
# If max_write_buffer_number > 3, writing will be slowed down to
|
||||
# options.delayed_write_rate if we are writing to the last write buffer
|
||||
# allowed.
|
||||
rocksdb.max_write_buffer_number 4
|
||||
|
||||
# Maximum number of concurrent background jobs (compactions and flushes).
|
||||
# For backwards compatibility we will set `max_background_jobs =
|
||||
# max_background_compactions + max_background_flushes` in the case where user
|
||||
# sets at least one of `max_background_compactions` or `max_background_flushes`
|
||||
# (we replace -1 by 1 in case one option is unset).
|
||||
rocksdb.max_background_jobs 4
|
||||
|
||||
# DEPRECATED: it is automatically decided based on the value of rocksdb.max_background_jobs
|
||||
# Maximum number of concurrent background compaction jobs, submitted to
|
||||
# the default LOW priority thread pool.
|
||||
rocksdb.max_background_compactions -1
|
||||
|
||||
# DEPRECATED: it is automatically decided based on the value of rocksdb.max_background_jobs
|
||||
# Maximum number of concurrent background memtable flush jobs, submitted by
|
||||
# default to the HIGH priority thread pool. If the HIGH priority thread pool
|
||||
# is configured to have zero threads, flush jobs will share the LOW priority
|
||||
# thread pool with compaction jobs.
|
||||
rocksdb.max_background_flushes -1
|
||||
|
||||
# This value represents the maximum number of threads that will
|
||||
# concurrently perform a compaction job by breaking it into multiple,
|
||||
# smaller ones that are run simultaneously.
|
||||
# Default: 2
|
||||
rocksdb.max_sub_compactions 2
|
||||
|
||||
# In order to limit the size of WALs, RocksDB uses DBOptions::max_total_wal_size
|
||||
# as the trigger of column family flush. Once WALs exceed this size, RocksDB
|
||||
# will start forcing the flush of column families to allow deletion of some
|
||||
# oldest WALs. This config can be useful when column families are updated at
|
||||
# non-uniform frequencies. If there's no size limit, users may need to keep
|
||||
# really old WALs when the infrequently-updated column families hasn't flushed
|
||||
# for a while.
|
||||
#
|
||||
# In kvrocks, we use multiple column families to store metadata, subkeys, etc.
|
||||
# If users always use string type, but use list, hash and other complex data types
|
||||
# infrequently, there will be a lot of old WALs if we don't set size limit
|
||||
# (0 by default in rocksdb), because rocksdb will dynamically choose the WAL size
|
||||
# limit to be [sum of all write_buffer_size * max_write_buffer_number] * 4 if set to 0.
|
||||
#
|
||||
# Moreover, you should increase this value if you already set rocksdb.write_buffer_size
|
||||
# to a big value, to avoid influencing the effect of rocksdb.write_buffer_size and
|
||||
# rocksdb.max_write_buffer_number.
|
||||
#
|
||||
# default is 512MB
|
||||
rocksdb.max_total_wal_size 512
|
||||
|
||||
# We implement the replication with rocksdb WAL, it would trigger full sync when the seq was out of range.
|
||||
# wal_ttl_seconds and wal_size_limit_mb would affect how archived logs will be deleted.
|
||||
# If WAL_ttl_seconds is not 0, then WAL files will be checked every WAL_ttl_seconds / 2 and those that
|
||||
# are older than WAL_ttl_seconds will be deleted.
|
||||
#
|
||||
# Default: 3 Hours
|
||||
rocksdb.wal_ttl_seconds 10800
|
||||
|
||||
# If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
|
||||
# WAL files will be checked every 10 min and if total size is greater
|
||||
# then WAL_size_limit_MB, they will be deleted starting with the
|
||||
# earliest until size_limit is met. All empty files will be deleted
|
||||
# Default: 16GB
|
||||
rocksdb.wal_size_limit_mb 16384
|
||||
|
||||
# Approximate size of user data packed per block. Note that the
|
||||
# block size specified here corresponds to uncompressed data. The
|
||||
# actual size of the unit read from disk may be smaller if
|
||||
# compression is enabled.
|
||||
#
|
||||
# Default: 16KB
|
||||
rocksdb.block_size 16384
|
||||
|
||||
# Indicating if we'd put index/filter blocks to the block cache
|
||||
#
|
||||
# Default: yes
|
||||
rocksdb.cache_index_and_filter_blocks yes
|
||||
|
||||
# Specify the compression to use. Only compress level greater
|
||||
# than 2 to improve performance.
|
||||
# Accept value: "no", "snappy", "lz4", "zstd", "zlib"
|
||||
# default snappy
|
||||
rocksdb.compression snappy
|
||||
|
||||
# If non-zero, we perform bigger reads when doing compaction. If you're
|
||||
# running RocksDB on spinning disks, you should set this to at least 2MB.
|
||||
# That way RocksDB's compaction is doing sequential instead of random reads.
|
||||
# When non-zero, we also force new_table_reader_for_compaction_inputs to
|
||||
# true.
|
||||
#
|
||||
# Default: 2 MB
|
||||
rocksdb.compaction_readahead_size 2097152
|
||||
|
||||
# The limited write rate to DB if soft_pending_compaction_bytes_limit or
|
||||
# level0_slowdown_writes_trigger is triggered.
|
||||
|
||||
# If the value is 0, we will infer a value from the `rate_limiter` value
# if it is not empty, or 16MB if `rate_limiter` is empty. Note that
|
||||
# if users change the rate in `rate_limiter` after DB is opened,
|
||||
# `delayed_write_rate` won't be adjusted.
|
||||
#
|
||||
rocksdb.delayed_write_rate 0
|
||||
# If enable_pipelined_write is true, separate write thread queue is
|
||||
# maintained for WAL write and memtable write.
|
||||
#
|
||||
# Default: no
|
||||
rocksdb.enable_pipelined_write no
|
||||
|
||||
# Soft limit on number of level-0 files. We start slowing down writes at this
|
||||
# point. A value <0 means that no writing slow down will be triggered by
|
||||
# number of files in level-0.
|
||||
#
|
||||
# Default: 20
|
||||
rocksdb.level0_slowdown_writes_trigger 20
|
||||
|
||||
# Maximum number of level-0 files. We stop writes at this point.
|
||||
#
|
||||
# Default: 40
|
||||
rocksdb.level0_stop_writes_trigger 40
|
||||
|
||||
# Number of files to trigger level-0 compaction.
|
||||
#
|
||||
# Default: 4
|
||||
rocksdb.level0_file_num_compaction_trigger 4
|
||||
|
||||
# if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
|
||||
#
|
||||
# Default: 0
|
||||
rocksdb.stats_dump_period_sec 0
|
||||
|
||||
# if yes, the auto compaction would be disabled, but the manual compaction remain works
|
||||
#
|
||||
# Default: no
|
||||
rocksdb.disable_auto_compactions no
|
||||
|
||||
# BlobDB(key-value separation) is essentially RocksDB for large-value use cases.
|
||||
# Since 6.18.0, The new implementation is integrated into the RocksDB core.
|
||||
# When set, large values (blobs) are written to separate blob files, and only
|
||||
# pointers to them are stored in SST files. This can reduce write amplification
|
||||
# for large-value use cases at the cost of introducing a level of indirection
|
||||
# for reads. Please see: https://github.com/facebook/rocksdb/wiki/BlobDB.
|
||||
#
|
||||
# Note that when enable_blob_files is set to yes, BlobDB-related configuration
|
||||
# items will take effect.
|
||||
#
|
||||
# Default: no
|
||||
rocksdb.enable_blob_files no
|
||||
|
||||
# The size of the smallest value to be stored separately in a blob file. Values
|
||||
# which have an uncompressed size smaller than this threshold are stored alongside
|
||||
# the keys in SST files in the usual fashion.
|
||||
#
|
||||
# Default: 4096 byte, 0 means that all values are stored in blob files
|
||||
rocksdb.min_blob_size 4096
|
||||
|
||||
# The size limit for blob files. When writing blob files, a new file is
|
||||
# opened once this limit is reached.
|
||||
#
|
||||
# Default: 268435456 bytes
|
||||
rocksdb.blob_file_size 268435456
|
||||
|
||||
# Enables garbage collection of blobs. Valid blobs residing in blob files
|
||||
# older than a cutoff get relocated to new files as they are encountered
|
||||
# during compaction, which makes it possible to clean up blob files once
|
||||
# they contain nothing but obsolete/garbage blobs.
|
||||
# See also rocksdb.blob_garbage_collection_age_cutoff below.
|
||||
#
|
||||
# Default: yes
|
||||
rocksdb.enable_blob_garbage_collection yes
|
||||
|
||||
# The percentage cutoff in terms of blob file age for garbage collection.
|
||||
# Blobs in the oldest N blob files will be relocated when encountered during
|
||||
# compaction, where N = (garbage_collection_cutoff/100) * number_of_blob_files.
|
||||
# Note that this value must belong to [0, 100].
|
||||
#
|
||||
# Default: 25
|
||||
rocksdb.blob_garbage_collection_age_cutoff 25
|
||||
|
||||
|
||||
# The purpose of the following three options are to dynamically adjust the upper limit of
|
||||
# the data that each layer can store according to the size of the different
|
||||
# layers of the LSM. Enabling this option will bring some improvements in
|
||||
# deletion efficiency and space amplification, but it will lose a certain
|
||||
# amount of read performance.
|
||||
# If you want to know more details about Levels' Target Size, you can read RocksDB wiki:
|
||||
# https://github.com/facebook/rocksdb/wiki/Leveled-Compaction#levels-target-size
|
||||
#
|
||||
# Default: yes
|
||||
rocksdb.level_compaction_dynamic_level_bytes yes
|
||||
|
||||
# The total file size of level-1 sst.
|
||||
#
|
||||
# Default: 268435456 bytes
|
||||
rocksdb.max_bytes_for_level_base 268435456
|
||||
|
||||
# Multiplication factor for the total file size of L(n+1) layers.
|
||||
# This option is a double type number in RocksDB, but kvrocks does
# not support the double data type yet, so we use an integer
# number instead of a double currently.
|
||||
#
|
||||
# Default: 10
|
||||
rocksdb.max_bytes_for_level_multiplier 10
|
||||
|
||||
# This feature only takes effect in Iterators and MultiGet.
|
||||
# If yes, RocksDB will try to read asynchronously and in parallel as much as possible to hide IO latency.
|
||||
# In iterators, it will prefetch data asynchronously in the background for each file being iterated on.
|
||||
# In MultiGet, it will read the necessary data blocks from those files in parallel as much as possible.
|
||||
|
||||
# Default no
|
||||
rocksdb.read_options.async_io no
|
||||
|
||||
# If yes, the write will be flushed from the operating system
|
||||
# buffer cache before the write is considered complete.
|
||||
# If this flag is enabled, writes will be slower.
|
||||
# If this flag is disabled, and the machine crashes, some recent
|
||||
# writes may be lost. Note that if it is just the process that
|
||||
# crashes (i.e., the machine does not reboot), no writes will be
|
||||
# lost even if sync==false.
|
||||
#
|
||||
# Default: no
|
||||
rocksdb.write_options.sync no
|
||||
|
||||
# If yes, writes will not first go to the write ahead log,
|
||||
# and the write may get lost after a crash.
|
||||
#
|
||||
# Default: no
|
||||
rocksdb.write_options.disable_wal no
|
||||
|
||||
# If enabled and we need to wait or sleep for the write request, fails
|
||||
# immediately.
|
||||
#
|
||||
# Default: no
|
||||
rocksdb.write_options.no_slowdown no
|
||||
|
||||
# If enabled, write requests are of lower priority if compaction is
|
||||
# behind. In this case, no_slowdown = true, the request will be canceled
|
||||
# immediately. Otherwise, it will be slowed down.
|
||||
# The slowdown value is determined by RocksDB to guarantee
|
||||
# it introduces minimum impacts to high priority writes.
|
||||
#
|
||||
# Default: no
|
||||
rocksdb.write_options.low_pri no
|
||||
|
||||
# If enabled, this writebatch will maintain the last insert positions of each
|
||||
# memtable as hints in concurrent write. It can improve write performance
|
||||
# in concurrent writes if keys in one writebatch are sequential.
|
||||
#
|
||||
# Default: no
|
||||
rocksdb.write_options.memtable_insert_hint_per_batch no
|
||||
|
||||
|
||||
# Support RocksDB auto-tune rate limiter for the background IO
|
||||
# if enabled, Rate limiter will limit the compaction write if flush write is high
|
||||
# Please see https://rocksdb.org/blog/2017/12/18/17-auto-tuned-rate-limiter.html
|
||||
#
|
||||
# Default: yes
|
||||
rocksdb.rate_limiter_auto_tuned yes
|
||||
|
||||
# Enable this option will schedule the deletion of obsolete files in a background thread
|
||||
# on iterator destruction. It can reduce the latency if there are many files to be removed.
|
||||
# see https://github.com/facebook/rocksdb/wiki/IO#avoid-blocking-io
|
||||
#
|
||||
# Default: yes
|
||||
# rocksdb.avoid_unnecessary_blocking_io yes
|
||||
|
||||
################################ NAMESPACE #####################################
|
||||
# namespace.test change.me
|
||||
backup-dir .//backup
|
@@ -0,0 +1,6 @@
#!/bin/bash

set -e
set -x

../../kvrocks/build/kvrocks -c kvrocks.conf
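
# Note: this assumes a kvrocks binary was built beforehand at ../../kvrocks/build/kvrocks,
# relative to the directory the script is started from (the full_index directory when
# launched via launch_full_index() above).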

@@ -93,6 +93,10 @@ class CaptureCache():
        self.user_agent: str | None = cache_entry.get('user_agent')
        self.referer: str | None = cache_entry.get('referer')

    @property
    def tree_ready(self) -> bool:
        return bool(_pickle_path(self.capture_dir))

    @property
    def tree(self) -> CrawledTree:
        if not self.capture_dir.exists():
@ -102,27 +106,36 @@ class CaptureCache():
|
|||
return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
|
||||
|
||||
|
||||
def remove_pickle_tree(capture_dir: Path) -> None:
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
def _pickle_path(capture_dir: Path) -> Path | None:
|
||||
pickle_file_gz = capture_dir / 'tree.pickle.gz'
|
||||
if pickle_file.exists():
|
||||
pickle_file.unlink()
|
||||
if pickle_file_gz.exists():
|
||||
pickle_file_gz.unlink()
|
||||
return pickle_file_gz
|
||||
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
if pickle_file.exists():
|
||||
return pickle_file
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def remove_pickle_tree(capture_dir: Path) -> None:
|
||||
pickle_path = _pickle_path(capture_dir)
|
||||
if pickle_path and pickle_path.exists():
|
||||
pickle_path.unlink()
|
||||
|
||||
|
||||
@lru_cache(maxsize=64)
|
||||
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
pickle_file_gz = capture_dir / 'tree.pickle.gz'
|
||||
pickle_path = _pickle_path(capture_dir)
|
||||
tree = None
|
||||
try:
|
||||
if pickle_file.exists():
|
||||
with pickle_file.open('rb') as _p:
|
||||
tree = pickle.load(_p)
|
||||
elif pickle_file_gz.exists():
|
||||
with gzip.open(pickle_file_gz, 'rb') as _pg:
|
||||
tree = pickle.load(_pg)
|
||||
if pickle_path:
|
||||
if pickle_path.suffix == '.gz':
|
||||
with gzip.open(pickle_path, 'rb') as _pg:
|
||||
tree = pickle.load(_pg)
|
||||
else: # not a GZ pickle
|
||||
with pickle_path.open('rb') as _p:
|
||||
tree = pickle.load(_p)
|
||||
except pickle.UnpicklingError:
|
||||
remove_pickle_tree(capture_dir)
|
||||
except EOFError:
|
||||
|
|
|
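The new _pickle_path() helper centralises the gzip/plain lookup that load_pickle_tree() and remove_pickle_tree() previously duplicated, while load_pickle_tree() stays memoised on the pair (capture_dir, last_mod_time). A minimal sketch of why that second argument matters (the load() helper below is illustrative, not part of the codebase):

from functools import lru_cache
from pathlib import Path

@lru_cache(maxsize=64)
def load(capture_dir: Path, last_mod_time: float) -> str:
    # Stand-in for the expensive pickle load.
    return f'tree for {capture_dir} as of {last_mod_time}'

d = Path('/tmp/capture')
load(d, 1700000000.0)  # loads and caches
load(d, 1700000000.0)  # same (path, mtime) key -> cache hit, no disk access
# Rebuilding tree.pickle.gz bumps the directory's mtime, so the caller passes a
# new key and the stale cached tree is simply never reused.
load(d, 1700000042.0)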
@@ -95,8 +95,10 @@ def safe_create_dir(to_create: Path) -> None:
def get_socket_path(name: str) -> str:
mapping = {
'cache': Path('cache', 'cache.sock'),
'indexing': Path('indexing', 'indexing.sock'),
'indexing': Path('indexing', 'indexing.sock')
}
if get_config('generic', 'index_everything'):
mapping['full_index'] = Path('full_index', 'full_index.sock')
return str(get_homedir() / mapping[name])
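In other words, the 'full_index' socket only exists in the mapping when index_everything is enabled. A quick usage sketch (the exact paths depend on get_homedir()):

from lookyloo.default import get_config, get_socket_path

get_socket_path('indexing')        # -> <homedir>/indexing/indexing.sock
if get_config('generic', 'index_everything'):
    # Separate kvrocks instance holding the index of *all* captures.
    get_socket_path('full_index')  # -> <homedir>/full_index/full_index.sock
# With index_everything disabled, asking for 'full_index' raises a KeyError
# because the key is never added to the mapping.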
@@ -24,24 +24,49 @@ from .default import get_socket_path, get_config

class Indexing():

def __init__(self) -> None:
def __init__(self, full_index: bool=False) -> None:
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(get_config('generic', 'loglevel'))
self.redis_pool_bytes: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'))
self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'), decode_responses=True)
self.__redis_pool_bytes: ConnectionPool
self.__redis_pool: ConnectionPool
if full_index:
self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('full_index'))
self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('full_index'), decode_responses=True)
else:
self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'))
self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'), decode_responses=True)

def clear_indexes(self) -> None:
self.redis.flushdb()

@property
def redis_bytes(self) -> Redis: # type: ignore[type-arg]
return Redis(connection_pool=self.redis_pool_bytes)
return Redis(connection_pool=self.__redis_pool_bytes)

@property
def redis(self) -> Redis: # type: ignore[type-arg]
return Redis(connection_pool=self.redis_pool)
return Redis(connection_pool=self.__redis_pool)

@property
def can_index(self) -> bool:
return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))

def indexing_done(self) -> None:
self.redis.delete('ongoing_indexing')

def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool]:
p = self.redis.pipeline()
p.sismember('indexed_urls', capture_uuid)
p.sismember('indexed_body_hashes', capture_uuid)
p.sismember('indexed_cookies', capture_uuid)
p.sismember('indexed_hhhashes', capture_uuid)
p.sismember('indexed_favicons', capture_uuid)
# This call for sure returns a tuple of 5 booleans
return p.execute() # type: ignore[return-value]

def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:
# only trigger this method if the capture was already indexed.
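The can_index / indexing_done pair above is a plain Redis lock: SET ongoing_indexing NX with a one-hour expiry. A sketch of how a background worker would typically use it (the indexing loop itself is elided):

from lookyloo import Indexing

indexing = Indexing()              # Indexing(full_index=True) targets the 'full_index' kvrocks socket
if indexing.can_index:             # atomically sets 'ongoing_indexing' (nx=True)
    try:
        ...                        # index a batch of captures here
    finally:
        indexing.indexing_done()   # delete the key so the next run can grab it
# If another process already holds the key, can_index is False and this run is
# skipped; the ex=3600 expiry keeps a crashed worker from blocking indexing forever.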
@@ -56,7 +56,6 @@ from .helpers import (get_captures_dir, get_email_template,
get_resources_hashes, get_taxonomies,
uniq_domains, ParsedUserAgent, load_cookies, UserAgents,
get_useragent_for_requests, make_ts_from_dirname)
from .indexing import Indexing
from .modules import (MISPs, PhishingInitiative, UniversalWhois,
UrlScan, VirusTotal, Phishtank, Hashlookup,
RiskIQ, RiskIQError, Pandora, URLhaus, CIRCLPDNS)

@@ -81,7 +80,6 @@ class Lookyloo():
def __init__(self) -> None:
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(get_config('generic', 'loglevel'))
self.indexing = Indexing()
self.user_agents = UserAgents()
self.is_public_instance = get_config('generic', 'public_instance')
self.public_domain = get_config('generic', 'public_domain')
@ -938,214 +936,10 @@ class Lookyloo():
|
|||
return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
|
||||
- set(ct.root_hartree.all_url_requests.keys()))
|
||||
|
||||
def get_body_hash_investigator(self, body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]:
|
||||
'''Returns all the captures related to a hash (sha512), used in the web interface.'''
|
||||
total_captures, details = self.indexing.get_body_hash_captures(body_hash, limit=-1)
|
||||
captures = []
|
||||
for capture_uuid, hostnode_uuid, hostname, _, url in details:
|
||||
cache = self.capture_cache(capture_uuid)
|
||||
if not cache:
|
||||
continue
|
||||
captures.append((cache.uuid, cache.title, cache.timestamp, hostnode_uuid, url))
|
||||
domains = self.indexing.get_body_hash_domains(body_hash)
|
||||
return captures, domains
|
||||
|
||||
def get_body_hash_full(self, body_hash: str, /) -> tuple[dict[str, list[dict[str, str]]], BytesIO]:
|
||||
'''Returns a lot of information about the hash (sha512) and the hits in the instance.
|
||||
Also contains the data (base64 encoded)'''
|
||||
details = self.indexing.get_body_hash_urls(body_hash)
|
||||
|
||||
# Break immediately if we have the hash of the empty file
|
||||
if body_hash == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e':
|
||||
return details, BytesIO()
|
||||
|
||||
# get the body from the first entry in the details list
|
||||
for _, entries in details.items():
|
||||
if not entries:
|
||||
continue
|
||||
ct = self.get_crawled_tree(entries[0]['capture'])
|
||||
try:
|
||||
urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
|
||||
except Exception:
|
||||
# Unable to find URLnode in the tree, it probably has been rebuilt.
|
||||
self.logger.warning(f'Unable to find {entries[0]["urlnode"]} in entries[0]["capture"]')
|
||||
continue
|
||||
|
||||
# From that point, we just try to get the content. Break as soon as we found one.
|
||||
if urlnode.body_hash == body_hash:
|
||||
# the hash we're looking for is the whole file
|
||||
return details, urlnode.body
|
||||
else:
|
||||
# The hash is an embedded resource
|
||||
for _, blobs in urlnode.embedded_ressources.items():
|
||||
for h, b in blobs:
|
||||
if h == body_hash:
|
||||
return details, b
|
||||
|
||||
# TODO: Couldn't find the file anywhere. Maybe return a warning in the file?
|
||||
return details, BytesIO()
|
||||
|
||||
def get_all_body_hashes(self, capture_uuid: str, /) -> dict[str, dict[str, URLNode | int]]:
|
||||
ct = self.get_crawled_tree(capture_uuid)
|
||||
to_return: dict[str, dict[str, URLNode | int]] = defaultdict()
|
||||
for node in ct.root_hartree.url_tree.traverse():
|
||||
if node.empty_response or node.body_hash in to_return:
|
||||
# If we have the same hash more than once, skip
|
||||
continue
|
||||
total_captures, details = self.indexing.get_body_hash_captures(node.body_hash, limit=-1)
|
||||
# Note for future: maybe get url, capture title, something better than just the hash to show to the user
|
||||
to_return[node.body_hash] = {'node': node, 'total_captures': total_captures}
|
||||
return to_return
|
||||
|
||||
def get_latest_url_capture(self, url: str, /) -> CaptureCache | None:
|
||||
'''Get the most recent capture with this URL'''
|
||||
captures = self.sorted_capture_cache(self.indexing.get_captures_url(url))
|
||||
if captures:
|
||||
return captures[0]
|
||||
return None
|
||||
|
||||
def get_url_occurrences(self, url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
|
||||
'''Get the most recent captures and URL nodes where the URL has been seen.'''
|
||||
captures = self.sorted_capture_cache(self.indexing.get_captures_url(url), cached_captures_only=cached_captures_only)
|
||||
|
||||
to_return: list[dict[str, Any]] = []
|
||||
for capture in captures[:limit]:
|
||||
ct = self.get_crawled_tree(capture.uuid)
|
||||
to_append: dict[str, str | dict[str, Any]] = {'capture_uuid': capture.uuid,
|
||||
'start_timestamp': capture.timestamp.isoformat(),
|
||||
'title': capture.title}
|
||||
urlnodes: dict[str, dict[str, str]] = {}
|
||||
for urlnode in ct.root_hartree.url_tree.search_nodes(name=url):
|
||||
urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(),
|
||||
'hostnode_uuid': urlnode.hostnode_uuid}
|
||||
if hasattr(urlnode, 'body_hash'):
|
||||
urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash
|
||||
to_append['urlnodes'] = urlnodes
|
||||
to_return.append(to_append)
|
||||
return to_return
|
||||
|
||||
def get_hostname_occurrences(self, hostname: str, /, with_urls_occurrences: bool=False, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
|
||||
'''Get the most recent captures and URL nodes where the hostname has been seen.'''
|
||||
captures = self.sorted_capture_cache(self.indexing.get_captures_hostname(hostname), cached_captures_only=cached_captures_only)
|
||||
|
||||
to_return: list[dict[str, Any]] = []
|
||||
for capture in captures[:limit]:
|
||||
ct = self.get_crawled_tree(capture.uuid)
|
||||
to_append: dict[str, str | list[Any] | dict[str, Any]] = {
|
||||
'capture_uuid': capture.uuid,
|
||||
'start_timestamp': capture.timestamp.isoformat(),
|
||||
'title': capture.title}
|
||||
hostnodes: list[str] = []
|
||||
if with_urls_occurrences:
|
||||
urlnodes: dict[str, dict[str, str]] = {}
|
||||
for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname):
|
||||
hostnodes.append(hostnode.uuid)
|
||||
if with_urls_occurrences:
|
||||
for urlnode in hostnode.urls:
|
||||
urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(),
|
||||
'url': urlnode.name,
|
||||
'hostnode_uuid': urlnode.hostnode_uuid}
|
||||
if hasattr(urlnode, 'body_hash'):
|
||||
urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash
|
||||
to_append['hostnodes'] = hostnodes
|
||||
if with_urls_occurrences:
|
||||
to_append['urlnodes'] = urlnodes
|
||||
to_return.append(to_append)
|
||||
return to_return
|
||||
|
||||
def get_cookie_name_investigator(self, cookie_name: str, /) -> tuple[list[tuple[str, str]], list[tuple[str, float, list[tuple[str, float]]]]]:
|
||||
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||
cached_captures = self.sorted_capture_cache([entry[0] for entry in self.indexing.get_cookies_names_captures(cookie_name)])
|
||||
captures = [(cache.uuid, cache.title) for cache in cached_captures]
|
||||
domains = [(domain, freq, self.indexing.cookies_names_domains_values(cookie_name, domain))
|
||||
for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
|
||||
return captures, domains
|
||||
|
||||
def compute_mmh3_shodan(self, favicon: bytes, /) -> str:
|
||||
b64 = base64.encodebytes(favicon)
|
||||
return str(mmh3.hash(b64))
|
||||
|
||||
def get_favicon_investigator(self, favicon_sha512: str,
|
||||
/,
|
||||
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
|
||||
tuple[str, str, str],
|
||||
dict[str, dict[str, dict[str, tuple[str, str]]]]]:
|
||||
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||
cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)])
|
||||
captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
|
||||
favicon = self.indexing.get_favicon(favicon_sha512)
|
||||
if favicon:
|
||||
mimetype = from_string(favicon, mime=True)
|
||||
b64_favicon = base64.b64encode(favicon).decode()
|
||||
mmh3_shodan = self.compute_mmh3_shodan(favicon)
|
||||
else:
|
||||
mimetype = ''
|
||||
b64_favicon = ''
|
||||
mmh3_shodan = ''
|
||||
|
||||
# For now, there is only one probabilistic hash algo for favicons, keeping it simple
|
||||
probabilistic_hash_algos = ['mmh3-shodan']
|
||||
probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
|
||||
if get_probabilistic:
|
||||
for algo in probabilistic_hash_algos:
|
||||
probabilistic_favicons[algo] = {}
|
||||
for mm3hash in self.indexing.get_probabilistic_hashes_favicon(algo, favicon_sha512):
|
||||
probabilistic_favicons[algo][mm3hash] = {}
|
||||
for sha512 in self.indexing.get_hashes_favicon_probablistic(algo, mm3hash):
|
||||
if sha512 == favicon_sha512:
|
||||
# Skip entry if it is the same as the favicon we are investigating
|
||||
continue
|
||||
favicon = self.indexing.get_favicon(sha512)
|
||||
if favicon:
|
||||
mimetype = from_string(favicon, mime=True)
|
||||
b64_favicon = base64.b64encode(favicon).decode()
|
||||
probabilistic_favicons[algo][mm3hash][sha512] = (mimetype, b64_favicon)
|
||||
if not probabilistic_favicons[algo][mm3hash]:
|
||||
# remove entry if it has no favicon
|
||||
probabilistic_favicons[algo].pop(mm3hash)
|
||||
if not probabilistic_favicons[algo]:
|
||||
# remove entry if it has no hash
|
||||
probabilistic_favicons.pop(algo)
|
||||
return captures, (mimetype, b64_favicon, mmh3_shodan), probabilistic_favicons
|
||||
|
||||
def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
|
||||
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||
all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh))
|
||||
if cached_captures := self.sorted_capture_cache([entry for entry in all_captures]):
|
||||
captures = []
|
||||
for cache in cached_captures:
|
||||
try:
|
||||
urlnode = self.get_urlnode_from_tree(cache.uuid, all_captures[cache.uuid])
|
||||
except Exception as e:
|
||||
self.logger.warning(f'Cache for {cache.uuid} needs a rebuild: {e}.')
|
||||
self._captures_index.remove_pickle(cache.uuid)
|
||||
continue
|
||||
captures.append((cache.uuid, urlnode.hostnode_uuid, urlnode.name, cache.title))
|
||||
# get the headers and format them as they were in the response
|
||||
urlnode = self.get_urlnode_from_tree(cached_captures[0].uuid, all_captures[cached_captures[0].uuid])
|
||||
headers = [(header["name"], header["value"]) for header in urlnode.response['headers']]
|
||||
return captures, headers
|
||||
return [], []
|
||||
|
||||
def hash_lookup(self, blob_hash: str, url: str, capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]:
|
||||
'''Search all the captures a specific hash was seen.
|
||||
If a URL is given, it splits the results if the hash is seen on the same URL or an other one.
|
||||
Capture UUID avoids duplicates on the same capture'''
|
||||
captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
|
||||
total_captures, details = self.indexing.get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1,
|
||||
prefered_uuids=set(self._captures_index.keys()))
|
||||
for h_capture_uuid, url_uuid, url_hostname, same_url, url in details:
|
||||
cache = self.capture_cache(h_capture_uuid)
|
||||
if cache and hasattr(cache, 'title'):
|
||||
if same_url:
|
||||
captures_list['same_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
|
||||
else:
|
||||
captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
|
||||
# Sort by timestamp by default
|
||||
captures_list['same_url'].sort(key=lambda y: y[3])
|
||||
captures_list['different_url'].sort(key=lambda y: y[3])
|
||||
return total_captures, captures_list
|
||||
|
||||
def get_ressource(self, tree_uuid: str, /, urlnode_uuid: str, h: str | None) -> tuple[str, BytesIO, str] | None:
|
||||
'''Get a specific resource from a URL node. If a hash is also given, we want an embedded resource'''
|
||||
try:
|
||||
|
@ -1375,116 +1169,6 @@ class Lookyloo():
|
|||
"""Get the preconfigured devices from Playwright"""
|
||||
return get_devices()
|
||||
|
||||
def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> tuple[HostNode, list[dict[str, Any]]]:
|
||||
'''Gather all the informations needed to display the Hostnode investigator popup.'''
|
||||
|
||||
def normalize_known_content(h: str, /, known_content: dict[str, Any], url: URLNode) -> tuple[str | list[Any] | None, tuple[bool, Any] | None]:
|
||||
''' There are a few different sources to figure out known vs. legitimate content,
|
||||
this method normalizes it for the web interface.'''
|
||||
known: str | list[Any] | None = None
|
||||
legitimate: tuple[bool, Any] | None = None
|
||||
if h not in known_content:
|
||||
return known, legitimate
|
||||
|
||||
if known_content[h]['type'] in ['generic', 'sanejs']:
|
||||
known = known_content[h]['details']
|
||||
elif known_content[h]['type'] == 'legitimate_on_domain':
|
||||
legit = False
|
||||
if url.hostname in known_content[h]['details']:
|
||||
legit = True
|
||||
legitimate = (legit, known_content[h]['details'])
|
||||
elif known_content[h]['type'] == 'malicious':
|
||||
legitimate = (False, known_content[h]['details'])
|
||||
|
||||
return known, legitimate
|
||||
|
||||
ct = self.get_crawled_tree(capture_uuid)
|
||||
hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
|
||||
|
||||
known_content = self.context.find_known_content(hostnode)
|
||||
self.uwhois.query_whois_hostnode(hostnode)
|
||||
|
||||
urls: list[dict[str, Any]] = []
|
||||
for url in hostnode.urls:
|
||||
# For the popup, we need:
|
||||
# * https vs http
|
||||
# * everything after the domain
|
||||
# * the full URL
|
||||
to_append: dict[str, Any] = {
|
||||
'encrypted': url.name.startswith('https'),
|
||||
'url_path': url.name.split('/', 3)[-1],
|
||||
'url_object': url,
|
||||
}
|
||||
|
||||
if not url.empty_response:
|
||||
# Index lookup
|
||||
# %%% Full body %%%
|
||||
freq = self.indexing.body_hash_fequency(url.body_hash)
|
||||
to_append['body_hash_details'] = freq
|
||||
if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1:
|
||||
to_append['body_hash_details']['other_captures'] = self.hash_lookup(url.body_hash, url.name, capture_uuid)
|
||||
|
||||
# %%% Embedded ressources %%%
|
||||
if hasattr(url, 'embedded_ressources') and url.embedded_ressources:
|
||||
to_append['embedded_ressources'] = {}
|
||||
for mimetype, blobs in url.embedded_ressources.items():
|
||||
for h, blob in blobs:
|
||||
if h in to_append['embedded_ressources']:
|
||||
# Skip duplicates
|
||||
continue
|
||||
freq_embedded = self.indexing.body_hash_fequency(h)
|
||||
to_append['embedded_ressources'][h] = freq_embedded
|
||||
to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes
|
||||
to_append['embedded_ressources'][h]['type'] = mimetype
|
||||
if freq_embedded['hash_freq'] > 1:
|
||||
to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid)
|
||||
for h in to_append['embedded_ressources'].keys():
|
||||
known, legitimate = normalize_known_content(h, known_content, url)
|
||||
if known:
|
||||
to_append['embedded_ressources'][h]['known_content'] = known
|
||||
elif legitimate:
|
||||
to_append['embedded_ressources'][h]['legitimacy'] = legitimate
|
||||
|
||||
known, legitimate = normalize_known_content(url.body_hash, known_content, url)
|
||||
if known:
|
||||
to_append['known_content'] = known
|
||||
elif legitimate:
|
||||
to_append['legitimacy'] = legitimate
|
||||
|
||||
# Optional: Cookies sent to server in request -> map to nodes who set the cookie in response
|
||||
if hasattr(url, 'cookies_sent'):
|
||||
to_display_sent: dict[str, set[Iterable[str | None]]] = defaultdict(set)
|
||||
for cookie, contexts in url.cookies_sent.items():
|
||||
if not contexts:
|
||||
# Locally created?
|
||||
to_display_sent[cookie].add(('Unknown origin', ))
|
||||
continue
|
||||
for context in contexts:
|
||||
to_display_sent[cookie].add((context['setter'].hostname, context['setter'].hostnode_uuid))
|
||||
to_append['cookies_sent'] = to_display_sent
|
||||
|
||||
# Optional: Cookies received from server in response -> map to nodes who send the cookie in request
|
||||
if hasattr(url, 'cookies_received'):
|
||||
to_display_received: dict[str, dict[str, set[Iterable[str | None]]]] = {'3rd_party': defaultdict(set), 'sent': defaultdict(set), 'not_sent': defaultdict(set)}
|
||||
for domain, c_received, is_3rd_party in url.cookies_received:
|
||||
if c_received not in ct.root_hartree.cookies_sent:
|
||||
# This cookie is never sent.
|
||||
if is_3rd_party:
|
||||
to_display_received['3rd_party'][c_received].add((domain, ))
|
||||
else:
|
||||
to_display_received['not_sent'][c_received].add((domain, ))
|
||||
continue
|
||||
|
||||
for url_node in ct.root_hartree.cookies_sent[c_received]:
|
||||
if is_3rd_party:
|
||||
to_display_received['3rd_party'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
|
||||
else:
|
||||
to_display_received['sent'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
|
||||
to_append['cookies_received'] = to_display_received
|
||||
|
||||
urls.append(to_append)
|
||||
return hostnode, urls
|
||||
|
||||
def get_stats(self) -> dict[str, list[Any]]:
|
||||
'''Gather statistics about the lookyloo instance'''
|
||||
today = date.today()
|
||||
|
|
|
@ -1,25 +1,25 @@
|
|||
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiobotocore"
|
||||
version = "2.11.2"
|
||||
version = "2.12.1"
|
||||
description = "Async client for aws services using botocore and aiohttp"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "aiobotocore-2.11.2-py3-none-any.whl", hash = "sha256:487fede588040bfa3a43df945275c28c1c73ca75bf705295adb9fbadd2e89be7"},
|
||||
{file = "aiobotocore-2.11.2.tar.gz", hash = "sha256:6dd7352248e3523019c5a54a395d2b1c31080697fc80a9ad2672de4eec8c7abd"},
|
||||
{file = "aiobotocore-2.12.1-py3-none-any.whl", hash = "sha256:6a9a3d646cf422f45fdc1e4256e78563ebffba64733bc9b8ca9123614e8ba9af"},
|
||||
{file = "aiobotocore-2.12.1.tar.gz", hash = "sha256:8706b28f16f93c541f6ed50352115a79d8f3499539f8d0bb70aa0f7a5379c1fe"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
aiohttp = ">=3.7.4.post0,<4.0.0"
|
||||
aioitertools = ">=0.5.1,<1.0.0"
|
||||
botocore = ">=1.33.2,<1.34.35"
|
||||
botocore = ">=1.34.41,<1.34.52"
|
||||
wrapt = ">=1.10.10,<2.0.0"
|
||||
|
||||
[package.extras]
|
||||
awscli = ["awscli (>=1.31.2,<1.32.35)"]
|
||||
boto3 = ["boto3 (>=1.33.2,<1.34.35)"]
|
||||
awscli = ["awscli (>=1.32.41,<1.32.52)"]
|
||||
boto3 = ["boto3 (>=1.34.41,<1.34.52)"]
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
|
@ -308,13 +308,13 @@ WTForms = "*"
|
|||
|
||||
[[package]]
|
||||
name = "botocore"
|
||||
version = "1.34.34"
|
||||
version = "1.34.51"
|
||||
description = "Low-level, data-driven core of boto 3."
|
||||
optional = false
|
||||
python-versions = ">= 3.8"
|
||||
files = [
|
||||
{file = "botocore-1.34.34-py3-none-any.whl", hash = "sha256:cd060b0d88ebb2b893f1411c1db7f2ba66cc18e52dcc57ad029564ef5fec437b"},
|
||||
{file = "botocore-1.34.34.tar.gz", hash = "sha256:54093dc97372bb7683f5c61a279aa8240408abf3b2cc494ae82a9a90c1b784b5"},
|
||||
{file = "botocore-1.34.51-py3-none-any.whl", hash = "sha256:01d5156247f991b3466a8404e3d7460a9ecbd9b214f9992d6ba797d9ddc6f120"},
|
||||
{file = "botocore-1.34.51.tar.gz", hash = "sha256:5086217442e67dd9de36ec7e87a0c663f76b7790d5fb6a12de565af95e87e319"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -1331,13 +1331,13 @@ test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.22)", "pa
|
|||
|
||||
[[package]]
|
||||
name = "ipython"
|
||||
version = "8.22.1"
|
||||
version = "8.22.2"
|
||||
description = "IPython: Productive Interactive Computing"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
files = [
|
||||
{file = "ipython-8.22.1-py3-none-any.whl", hash = "sha256:869335e8cded62ffb6fac8928e5287a05433d6462e3ebaac25f4216474dd6bc4"},
|
||||
{file = "ipython-8.22.1.tar.gz", hash = "sha256:39c6f9efc079fb19bfb0f17eee903978fe9a290b1b82d68196c641cecb76ea22"},
|
||||
{file = "ipython-8.22.2-py3-none-any.whl", hash = "sha256:3c86f284c8f3d8f2b6c662f885c4889a91df7cd52056fd02b7d8d6195d7f56e9"},
|
||||
{file = "ipython-8.22.2.tar.gz", hash = "sha256:2dcaad9049f9056f1fef63514f176c7d41f930daa78d05b82a176202818f2c14"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -1463,20 +1463,21 @@ referencing = ">=0.31.0"
|
|||
|
||||
[[package]]
|
||||
name = "lacuscore"
|
||||
version = "1.8.7"
|
||||
version = "1.8.8"
|
||||
description = "Core of Lacus, usable as a module"
|
||||
optional = false
|
||||
python-versions = ">=3.8,<4.0"
|
||||
files = [
|
||||
{file = "lacuscore-1.8.7-py3-none-any.whl", hash = "sha256:1ac849b1308eb780f1976fdf21d6476bd911e6ae1e91f79d83612baef90afee8"},
|
||||
{file = "lacuscore-1.8.7.tar.gz", hash = "sha256:67268fb4da1282d1c7f747b02611dd5ee549644e034d0acba2173396ce0d0408"},
|
||||
{file = "lacuscore-1.8.8-py3-none-any.whl", hash = "sha256:04812225e101ec59b3d1dcc6d3474e3cd2f3fd656a72d619e7d7d238d00b5a27"},
|
||||
{file = "lacuscore-1.8.8.tar.gz", hash = "sha256:41949ff67d056f8ba717b649d8b45307ff7d38d4c38291cb1a8b80ca2ce94f6f"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
async-timeout = {version = ">=4.0.3,<5.0.0", markers = "python_version < \"3.11\""}
|
||||
defang = ">=0.5.3,<0.6.0"
|
||||
dnspython = ">=2.6.1,<3.0.0"
|
||||
playwrightcapture = {version = ">=1.23.8,<2.0.0", extras = ["recaptcha"]}
|
||||
redis = {version = ">=5.0.1,<6.0.0", extras = ["hiredis"]}
|
||||
playwrightcapture = {version = ">=1.23.9,<2.0.0", extras = ["recaptcha"]}
|
||||
redis = {version = ">=5.0.2,<6.0.0", extras = ["hiredis"]}
|
||||
requests = ">=2.31.0,<3.0.0"
|
||||
ua-parser = ">=0.18.0,<0.19.0"
|
||||
|
||||
|
@ -2286,13 +2287,13 @@ test = ["pytest"]
|
|||
|
||||
[[package]]
|
||||
name = "playwrightcapture"
|
||||
version = "1.23.8"
|
||||
version = "1.23.9"
|
||||
description = "A simple library to capture websites using playwright"
|
||||
optional = false
|
||||
python-versions = ">=3.8,<4.0"
|
||||
files = [
|
||||
{file = "playwrightcapture-1.23.8-py3-none-any.whl", hash = "sha256:f3e4d6c0355b013e465f9d3eea961b9431303a5de227a1388a7287c872203b9e"},
|
||||
{file = "playwrightcapture-1.23.8.tar.gz", hash = "sha256:d2caea8d7a16d739f28dc06bbbc12665be89d07d325bba6868dab5f8520db809"},
|
||||
{file = "playwrightcapture-1.23.9-py3-none-any.whl", hash = "sha256:0324f587605aa85ede1b71c12ec735383d932324f0e66ef35345c6e08734273c"},
|
||||
{file = "playwrightcapture-1.23.9.tar.gz", hash = "sha256:e7217fc2a6109f240918de977452c556f482822abb12f0db43fa28228d3c0c90"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -2339,13 +2340,13 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "publicsuffixlist"
|
||||
version = "0.10.0.20240214"
|
||||
version = "0.10.0.20240305"
|
||||
description = "publicsuffixlist implement"
|
||||
optional = false
|
||||
python-versions = ">=2.6"
|
||||
files = [
|
||||
{file = "publicsuffixlist-0.10.0.20240214-py2.py3-none-any.whl", hash = "sha256:2c3b8da819571bb610328bda5b25d27fcbf6bc400896ca3c6502d291a16b32f4"},
|
||||
{file = "publicsuffixlist-0.10.0.20240214.tar.gz", hash = "sha256:45a206c5f9c1eccf138481280cfb0a67c2ccafc782ef89c7fd6dc6c4356230fe"},
|
||||
{file = "publicsuffixlist-0.10.0.20240305-py2.py3-none-any.whl", hash = "sha256:f6869119f8781501c0c625e59b4b65eb60e2ed5185cfd6c142c792f74ac47c21"},
|
||||
{file = "publicsuffixlist-0.10.0.20240305.tar.gz", hash = "sha256:6e79ea73b0278ce1b102f3ad6815f2a5b683864da9948ba0b0eab3180c419f7f"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
|
@ -2571,13 +2572,13 @@ docs = ["Sphinx (<7.2)", "Sphinx (>=7.2,<8.0)"]
|
|||
|
||||
[[package]]
|
||||
name = "pymisp"
|
||||
version = "2.4.185"
|
||||
version = "2.4.186"
|
||||
description = "Python API for MISP."
|
||||
optional = false
|
||||
python-versions = ">=3.8,<4.0"
|
||||
files = [
|
||||
{file = "pymisp-2.4.185-py3-none-any.whl", hash = "sha256:e2635a2be92321d4f812c7220bd955817e95a286343720f138b87892a827117a"},
|
||||
{file = "pymisp-2.4.185.tar.gz", hash = "sha256:3ccdc6ee48d26d82c77ba3f5d8fd41a79eaaef0ad5619f37a65b060e92f6da4c"},
|
||||
{file = "pymisp-2.4.186-py3-none-any.whl", hash = "sha256:bb8ae23d038848a86cf5d6a4c965dbed79e48cd6f671681b17f72410aecf07a0"},
|
||||
{file = "pymisp-2.4.186.tar.gz", hash = "sha256:bdf2d54b297ad890418179b044dd4ea79821fccef723823919d12262e9794ca3"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -2593,7 +2594,7 @@ requests = ">=2.31.0,<3.0.0"
|
|||
[package.extras]
|
||||
brotli = ["urllib3[brotli]"]
|
||||
docs = ["Sphinx (<7.2)", "Sphinx (>=7.2,<8.0)", "recommonmark (>=0.7.1,<0.8.0)", "sphinx-autodoc-typehints (>=2.0.0,<3.0.0)"]
|
||||
email = ["RTFDE (>=0.1.1,<0.2.0)", "extract_msg (>=0.47.0,<0.48.0)", "oletools (>=0.60.1,<0.61.0)"]
|
||||
email = ["RTFDE (>=0.1.1,<0.2.0)", "extract_msg (>=0.48.0,<0.49.0)", "oletools (>=0.60.1,<0.61.0)"]
|
||||
fileobjects = ["lief (>=0.14.1,<0.15.0)", "pydeep2 (>=0.5.1,<0.6.0)", "python-magic (>=0.4.27,<0.5.0)"]
|
||||
openioc = ["beautifulsoup4 (>=4.12.3,<5.0.0)"]
|
||||
pdfexport = ["reportlab (>=4.1.0,<5.0.0)"]
|
||||
|
@ -2668,13 +2669,13 @@ requests = ">=2.31.0,<3.0.0"
|
|||
|
||||
[[package]]
|
||||
name = "pysecuritytxt"
|
||||
version = "1.2.2"
|
||||
version = "1.3.0"
|
||||
description = "Python CLI and module for querying security.txt files on domains."
|
||||
optional = false
|
||||
python-versions = ">=3.8,<4.0"
|
||||
files = [
|
||||
{file = "pysecuritytxt-1.2.2-py3-none-any.whl", hash = "sha256:08d8750d82e9502ba949a6ea7bab355ca183cfc3cd722ed3e492ba35a8d4edda"},
|
||||
{file = "pysecuritytxt-1.2.2.tar.gz", hash = "sha256:31d4ea4814e2cdeffce304e7b6f9d58580e7fb6578c8694bb6f8c0df59e65b3d"},
|
||||
{file = "pysecuritytxt-1.3.0-py3-none-any.whl", hash = "sha256:9e4eb6b4fdca8f8471c80696c4d7642be24d44c8c3f627870ca9b7bd3f221cd5"},
|
||||
{file = "pysecuritytxt-1.3.0.tar.gz", hash = "sha256:3669be69e90672ed0d448b385e5fef49cb3a6a611d7e386d673c4f0e1cc3e83b"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -2712,13 +2713,13 @@ webui = ["Flask (>=2.0,<3.0)", "Flask-Bootstrap (>=3.3.7.1,<4.0.0.0)", "Flask-WT
|
|||
|
||||
[[package]]
|
||||
name = "python-dateutil"
|
||||
version = "2.8.2"
|
||||
version = "2.9.0.post0"
|
||||
description = "Extensions to the standard Python datetime module"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
|
||||
files = [
|
||||
{file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
|
||||
{file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
|
||||
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
|
||||
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -2748,17 +2749,17 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "redis"
|
||||
version = "5.0.1"
|
||||
version = "5.0.2"
|
||||
description = "Python client for Redis database and key-value store"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "redis-5.0.1-py3-none-any.whl", hash = "sha256:ed4802971884ae19d640775ba3b03aa2e7bd5e8fb8dfaed2decce4d0fc48391f"},
|
||||
{file = "redis-5.0.1.tar.gz", hash = "sha256:0dab495cd5753069d3bc650a0dde8a8f9edde16fc5691b689a566eda58100d0f"},
|
||||
{file = "redis-5.0.2-py3-none-any.whl", hash = "sha256:4caa8e1fcb6f3c0ef28dba99535101d80934b7d4cd541bbb47f4a3826ee472d1"},
|
||||
{file = "redis-5.0.2.tar.gz", hash = "sha256:3f82cc80d350e93042c8e6e7a5d0596e4dd68715babffba79492733e1f367037"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
async-timeout = {version = ">=4.0.2", markers = "python_full_version <= \"3.11.2\""}
|
||||
async-timeout = ">=4.0.3"
|
||||
hiredis = {version = ">=1.0.0", optional = true, markers = "extra == \"hiredis\""}
|
||||
|
||||
[package.extras]
|
||||
|
@ -2942,6 +2943,7 @@ optional = false
|
|||
python-versions = "*"
|
||||
files = [
|
||||
{file = "requests-file-2.0.0.tar.gz", hash = "sha256:20c5931629c558fda566cacc10cfe2cd502433e628f568c34c80d96a0cc95972"},
|
||||
{file = "requests_file-2.0.0-py2.py3-none-any.whl", hash = "sha256:3e493d390adb44aa102ebea827a48717336d5268968c370eaf19abaf5cae13bf"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -2949,13 +2951,13 @@ requests = ">=1.0.0"
|
|||
|
||||
[[package]]
|
||||
name = "rich"
|
||||
version = "13.7.0"
|
||||
version = "13.7.1"
|
||||
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
files = [
|
||||
{file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"},
|
||||
{file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"},
|
||||
{file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"},
|
||||
{file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -3217,13 +3219,13 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,
|
|||
|
||||
[[package]]
|
||||
name = "types-beautifulsoup4"
|
||||
version = "4.12.0.20240106"
|
||||
version = "4.12.0.20240229"
|
||||
description = "Typing stubs for beautifulsoup4"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "types-beautifulsoup4-4.12.0.20240106.tar.gz", hash = "sha256:98d628985b71b140bd3bc22a8cb0ab603c2f2d08f20d37925965eb4a21739be8"},
|
||||
{file = "types_beautifulsoup4-4.12.0.20240106-py3-none-any.whl", hash = "sha256:cbdd60ab8aeac737ac014431b6e921b43e84279c0405fdd25a6900bb0e71da5b"},
|
||||
{file = "types-beautifulsoup4-4.12.0.20240229.tar.gz", hash = "sha256:e37e4cfa11b03b01775732e56d2c010cb24ee107786277bae6bc0fa3e305b686"},
|
||||
{file = "types_beautifulsoup4-4.12.0.20240229-py3-none-any.whl", hash = "sha256:000cdddb8aee4effb45a04be95654de8629fb8594a4f2f1231cff81108977324"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -3242,13 +3244,13 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "types-html5lib"
|
||||
version = "1.1.11.20240222"
|
||||
version = "1.1.11.20240228"
|
||||
description = "Typing stubs for html5lib"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "types-html5lib-1.1.11.20240222.tar.gz", hash = "sha256:d9517ec6ba2fa1f63113e2930a59b60722a976cc983b94d7fd772f14865e1152"},
|
||||
{file = "types_html5lib-1.1.11.20240222-py3-none-any.whl", hash = "sha256:86b2dcbbebca846e68d2eac46b2717980e632de4b5d8f62ccd23d8333d2e7647"},
|
||||
{file = "types-html5lib-1.1.11.20240228.tar.gz", hash = "sha256:22736b7299e605ec4ba539d48691e905fd0c61c3ea610acc59922232dc84cede"},
|
||||
{file = "types_html5lib-1.1.11.20240228-py3-none-any.whl", hash = "sha256:af5de0125cb0fe5667543b158db83849b22e25c0e36c9149836b095548bf1020"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -3275,13 +3277,13 @@ files = [
|
|||
|
||||
[[package]]
|
||||
name = "types-pyopenssl"
|
||||
version = "24.0.0.20240130"
|
||||
version = "24.0.0.20240228"
|
||||
description = "Typing stubs for pyOpenSSL"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "types-pyOpenSSL-24.0.0.20240130.tar.gz", hash = "sha256:c812e5c1c35249f75ef5935708b2a997d62abf9745be222e5f94b9595472ab25"},
|
||||
{file = "types_pyOpenSSL-24.0.0.20240130-py3-none-any.whl", hash = "sha256:24a255458b5b8a7fca8139cf56f2a8ad5a4f1a5f711b73a5bb9cb50dc688fab5"},
|
||||
{file = "types-pyOpenSSL-24.0.0.20240228.tar.gz", hash = "sha256:cd990717d8aa3743ef0e73e0f462e64b54d90c304249232d48fece4f0f7c3c6a"},
|
||||
{file = "types_pyOpenSSL-24.0.0.20240228-py3-none-any.whl", hash = "sha256:a472cf877a873549175e81972f153f44e975302a3cf17381eb5f3d41ccfb75a4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -3734,4 +3736,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<3.13"
|
||||
content-hash = "bc64701a9d95985f7d0c91086fabfc29a1c54affc60bfab612fecc3771d6acd4"
|
||||
content-hash = "7e76c4614efed850e101ecaa1e91f141649ef4ad508522f0323e8efffc9eda7d"
|
||||
|
|
|
@@ -29,6 +29,8 @@ shutdown = "bin.shutdown:main"
run_backend = "bin.run_backend:main"
async_capture = "bin.async_capture:main"
background_indexer = "bin.background_indexer:main"
background_build_captures = "bin.background_build_captures:main"
background_full_indexer = "bin.background_indexer:main_full_indexer"
archiver = "bin.archiver:main"
processing = "bin.background_processing:main"
start_website = "bin.start_website:main"
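The new background_full_indexer entry point reuses bin/background_indexer.py. A plausible shape for main_full_indexer, given the BackgroundIndexer(full=...) constructor introduced in this commit (a sketch under that assumption, not the actual file contents; the run(sleep_in_sec=...) loop is the AbstractManager convention the other entry points follow):

from lookyloo.default import get_config
from bin.background_indexer import BackgroundIndexer

def main_full_indexer() -> None:
    if not get_config('generic', 'index_everything'):
        raise Exception('Full indexer is not enabled in the config.')
    # full=True makes the indexer write to the kvrocks-backed full index
    # and report itself as 'background_full_indexer'.
    BackgroundIndexer(full=True).run(sleep_in_sec=60)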
@ -40,7 +42,7 @@ requests = "^2.31.0"
|
|||
flask = "^3.0.2"
|
||||
gunicorn = "^21.2.0"
|
||||
charset-normalizer = "^3.3.2"
|
||||
redis = {version = "^5.0.1", extras = ["hiredis"]}
|
||||
redis = {version = "^5.0.2", extras = ["hiredis"]}
|
||||
beautifulsoup4 = {version = "^4.12.3", extras = ["lxml", "charset_normalizer"]}
|
||||
bootstrap-flask = "^2.3.3"
|
||||
defang = "^0.5.3"
|
||||
|
@ -50,10 +52,10 @@ pysanejs = "^2.0.2"
|
|||
pylookyloo = "^1.23.1"
|
||||
dnspython = "^2.6.1"
|
||||
pytaxonomies = "^1.5.0"
|
||||
pymisp = {version = "^2.4.185", extras = ["url", "fileobjects"]}
|
||||
pymisp = {version = "^2.4.186", extras = ["url", "fileobjects"]}
|
||||
Pillow = "^10.2.0"
|
||||
flask-restx = "^1.3.0"
|
||||
rich = "^13.7.0"
|
||||
rich = "^13.7.1"
|
||||
pyphishtanklookup = "^1.3.2"
|
||||
Flask-Cors = "^4.0.0"
|
||||
pyhashlookup = "^1.2.2"
|
||||
|
@ -65,13 +67,13 @@ passivetotal = "^2.5.9"
|
|||
werkzeug = "^3.0.1"
|
||||
filetype = "^1.2.0"
|
||||
pypandora = "^1.8.0"
|
||||
lacuscore = "^1.8.7"
|
||||
lacuscore = "^1.8.8"
|
||||
pylacus = "^1.8.0"
|
||||
pyipasnhistory = "^2.1.2"
|
||||
publicsuffixlist = "^0.10.0.20240205"
|
||||
pyfaup = "^1.2"
|
||||
chardet = "^5.2.0"
|
||||
pysecuritytxt = "^1.2.2"
|
||||
pysecuritytxt = "^1.3.0"
|
||||
pylookyloomonitoring = "^1.1.3"
|
||||
pytz = {"version" = "^2024.1", python = "<3.9"}
|
||||
s3fs = "^2024.2.0"
|
||||
|
@ -98,7 +100,7 @@ types-redis = {version = "^4.6.0.20240218"}
|
|||
types-pkg-resources = "^0.1.3"
|
||||
types-Deprecated = "^1.2.9.20240106"
|
||||
types-python-dateutil = "^2.8.19.20240106"
|
||||
types-beautifulsoup4 = "^4.12.0.20240106"
|
||||
types-beautifulsoup4 = "^4.12.0.20240229"
|
||||
types-Pillow = "^10.2.0.20240213"
|
||||
types-pytz = "^2024.1.0.20240203"
|
||||
|
||||
|
|
|
@@ -17,14 +17,16 @@ import time

import filetype # type: ignore[import-untyped]

from collections import defaultdict
from datetime import date, datetime, timedelta, timezone
from importlib.metadata import version
from io import BytesIO, StringIO
from typing import Any, TypedDict
from typing import Any, TypedDict, Iterable
from urllib.parse import quote_plus, unquote_plus, urlparse
from uuid import uuid4
from zipfile import ZipFile

from har2tree import HostNode, URLNode
import flask_login # type: ignore[import-untyped]
from flask import (Flask, Response, Request, flash, jsonify, redirect, render_template,
request, send_file, url_for)

@@ -37,7 +39,8 @@ from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined]
from werkzeug.security import check_password_hash
from werkzeug.wrappers.response import Response as WerkzeugResponse

from lookyloo import Lookyloo, CaptureSettings
from lookyloo import Lookyloo, CaptureSettings, Indexing
from lookyloo.capturecache import CaptureCache
from lookyloo.default import get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.helpers import get_taxonomies, UserAgents, load_cookies
@@ -262,6 +265,353 @@ def file_response(func): # type: ignore[no-untyped-def]
return wrapper


# ##### Methods querying the indexes #####

@functools.cache
def get_indexing(user: User | None) -> Indexing:
'''Depending on whether we're logged in or not, we (can) get different indexes:
if index_everything is enabled, we have an index in kvrocks that contains all
the indexes for all the captures.
It is only accessible to the admin user.
'''
if not get_config('generic', 'index_everything'):
return Indexing()

if not user or not user.is_authenticated:
# No user or anonymous
return Indexing()
# Logged in user
return Indexing(full_index=True)
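For readers skimming the diff, the outcome of that selection boils down to the following (a sketch; User is the flask_login user object the web app already uses):

# index_everything disabled               -> Indexing()                   (regular index)
# index_everything enabled, anonymous     -> Indexing()                   (regular index)
# index_everything enabled, authenticated -> Indexing(full_index=True)    (kvrocks full index)
#
# functools.cache memoises the result per user object, so the underlying
# connection pools are built once instead of on every index lookup.
indexing = get_indexing(flask_login.current_user)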
def _get_body_hash_investigator(body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]:
|
||||
'''Returns all the captures related to a hash (sha512), used in the web interface.'''
|
||||
total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(body_hash, limit=-1)
|
||||
captures = []
|
||||
for capture_uuid, hostnode_uuid, hostname, _, url in details:
|
||||
cache = lookyloo.capture_cache(capture_uuid)
|
||||
if not cache:
|
||||
continue
|
||||
captures.append((cache.uuid, cache.title, cache.timestamp, hostnode_uuid, url))
|
||||
domains = get_indexing(flask_login.current_user).get_body_hash_domains(body_hash)
|
||||
return captures, domains
|
||||
|
||||
|
||||
def get_body_hash_full(body_hash: str, /) -> tuple[dict[str, list[dict[str, str]]], BytesIO]:
|
||||
'''Returns a lot of information about the hash (sha512) and the hits in the instance.
|
||||
Also contains the data (base64 encoded)'''
|
||||
details = get_indexing(flask_login.current_user).get_body_hash_urls(body_hash)
|
||||
|
||||
# Break immediately if we have the hash of the empty file
|
||||
if body_hash == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e':
|
||||
return details, BytesIO()
|
||||
|
||||
# get the body from the first entry in the details list
|
||||
for _, entries in details.items():
|
||||
if not entries:
|
||||
continue
|
||||
ct = lookyloo.get_crawled_tree(entries[0]['capture'])
|
||||
try:
|
||||
urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
|
||||
except Exception:
|
||||
# Unable to find URLnode in the tree, it probably has been rebuilt.
|
||||
# TODO throw a log line or something
|
||||
# self.logger.warning(f'Unable to find {entries[0]["urlnode"]} in entries[0]["capture"]')
|
||||
# lookyloo._captures_index.remove_pickle(<capture UUID>)
|
||||
continue
|
||||
|
||||
# From that point, we just try to get the content. Break as soon as we found one.
|
||||
if urlnode.body_hash == body_hash:
|
||||
# the hash we're looking for is the whole file
|
||||
return details, urlnode.body
|
||||
else:
|
||||
# The hash is an embedded resource
|
||||
for _, blobs in urlnode.embedded_ressources.items():
|
||||
for h, b in blobs:
|
||||
if h == body_hash:
|
||||
return details, b
|
||||
|
||||
# TODO: Couldn't find the file anywhere. Maybe return a warning in the file?
|
||||
return details, BytesIO()
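As an aside, the hard-coded digest this function short-circuits on is simply the SHA-512 of empty content, which is easy to verify:

import hashlib

# SHA-512 of zero bytes; matches the constant used above to skip lookups for empty bodies.
assert hashlib.sha512(b'').hexdigest() == (
    'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce'
    '47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')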
|
||||
|
||||
|
||||
def get_all_body_hashes(capture_uuid: str, /) -> dict[str, dict[str, URLNode | int]]:
|
||||
ct = lookyloo.get_crawled_tree(capture_uuid)
|
||||
to_return: dict[str, dict[str, URLNode | int]] = defaultdict()
|
||||
for node in ct.root_hartree.url_tree.traverse():
|
||||
if node.empty_response or node.body_hash in to_return:
|
||||
# If we have the same hash more than once, skip
|
||||
continue
|
||||
total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(node.body_hash, limit=-1)
|
||||
# Note for future: maybe get url, capture title, something better than just the hash to show to the user
|
||||
to_return[node.body_hash] = {'node': node, 'total_captures': total_captures}
|
||||
return to_return
|
||||
|
||||
|
||||
def get_latest_url_capture(url: str, /) -> CaptureCache | None:
|
||||
'''Get the most recent capture with this URL'''
|
||||
captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url))
|
||||
if captures:
|
||||
return captures[0]
|
||||
return None
|
||||
|
||||
|
||||
def get_url_occurrences(url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
|
||||
'''Get the most recent captures and URL nodes where the URL has been seen.'''
|
||||
captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url), cached_captures_only=cached_captures_only)
|
||||
|
||||
to_return: list[dict[str, Any]] = []
|
||||
for capture in captures[:limit]:
|
||||
ct = lookyloo.get_crawled_tree(capture.uuid)
|
||||
to_append: dict[str, str | dict[str, Any]] = {'capture_uuid': capture.uuid,
|
||||
'start_timestamp': capture.timestamp.isoformat(),
|
||||
'title': capture.title}
|
||||
urlnodes: dict[str, dict[str, str]] = {}
|
||||
for urlnode in ct.root_hartree.url_tree.search_nodes(name=url):
|
||||
urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(),
|
||||
'hostnode_uuid': urlnode.hostnode_uuid}
|
||||
if hasattr(urlnode, 'body_hash'):
|
||||
urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash
|
||||
to_append['urlnodes'] = urlnodes
|
||||
to_return.append(to_append)
|
||||
return to_return
|
||||
|
||||
|
||||
def get_hostname_occurrences(hostname: str, /, with_urls_occurrences: bool=False, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
|
||||
'''Get the most recent captures and URL nodes where the hostname has been seen.'''
|
||||
captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_hostname(hostname), cached_captures_only=cached_captures_only)
|
||||
|
||||
to_return: list[dict[str, Any]] = []
|
||||
for capture in captures[:limit]:
|
||||
ct = lookyloo.get_crawled_tree(capture.uuid)
|
||||
to_append: dict[str, str | list[Any] | dict[str, Any]] = {
|
||||
'capture_uuid': capture.uuid,
|
||||
'start_timestamp': capture.timestamp.isoformat(),
|
||||
'title': capture.title}
|
||||
hostnodes: list[str] = []
|
||||
if with_urls_occurrences:
|
||||
urlnodes: dict[str, dict[str, str]] = {}
|
||||
for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname):
|
||||
hostnodes.append(hostnode.uuid)
|
||||
if with_urls_occurrences:
|
||||
for urlnode in hostnode.urls:
|
||||
urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(),
|
||||
'url': urlnode.name,
|
||||
'hostnode_uuid': urlnode.hostnode_uuid}
|
||||
if hasattr(urlnode, 'body_hash'):
|
||||
urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash
|
||||
to_append['hostnodes'] = hostnodes
|
||||
if with_urls_occurrences:
|
||||
to_append['urlnodes'] = urlnodes
|
||||
to_return.append(to_append)
|
||||
return to_return
|
||||
|
||||
|
||||
def get_cookie_name_investigator(cookie_name: str, /) -> tuple[list[tuple[str, str]], list[tuple[str, float, list[tuple[str, float]]]]]:
|
||||
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||
cached_captures = lookyloo.sorted_capture_cache([entry[0] for entry in get_indexing(flask_login.current_user).get_cookies_names_captures(cookie_name)])
|
||||
captures = [(cache.uuid, cache.title) for cache in cached_captures]
|
||||
domains = [(domain, freq, get_indexing(flask_login.current_user).cookies_names_domains_values(cookie_name, domain))
|
||||
for domain, freq in get_indexing(flask_login.current_user).get_cookie_domains(cookie_name)]
|
||||
return captures, domains
|
||||
|
||||
|
||||
def get_favicon_investigator(favicon_sha512: str,
|
||||
/,
|
||||
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
|
||||
tuple[str, str, str],
|
||||
dict[str, dict[str, dict[str, tuple[str, str]]]]]:
|
||||
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||
cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_favicon(favicon_sha512)])
|
||||
captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
|
||||
favicon = get_indexing(flask_login.current_user).get_favicon(favicon_sha512)
|
||||
if favicon:
|
||||
mimetype = from_string(favicon, mime=True)
|
||||
b64_favicon = base64.b64encode(favicon).decode()
|
||||
mmh3_shodan = lookyloo.compute_mmh3_shodan(favicon)
|
||||
else:
|
||||
mimetype = ''
|
||||
b64_favicon = ''
|
||||
mmh3_shodan = ''
|
||||
|
||||
# For now, there is only one probabilistic hash algo for favicons, keeping it simple
|
||||
probabilistic_hash_algos = ['mmh3-shodan']
|
||||
probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
|
||||
if get_probabilistic:
|
||||
for algo in probabilistic_hash_algos:
|
||||
probabilistic_favicons[algo] = {}
|
||||
for mm3hash in get_indexing(flask_login.current_user).get_probabilistic_hashes_favicon(algo, favicon_sha512):
|
||||
probabilistic_favicons[algo][mm3hash] = {}
|
||||
for sha512 in get_indexing(flask_login.current_user).get_hashes_favicon_probablistic(algo, mm3hash):
|
||||
if sha512 == favicon_sha512:
|
||||
# Skip entry if it is the same as the favicon we are investigating
|
||||
continue
|
||||
favicon = get_indexing(flask_login.current_user).get_favicon(sha512)
|
||||
if favicon:
|
||||
mimetype = from_string(favicon, mime=True)
|
||||
b64_favicon = base64.b64encode(favicon).decode()
|
||||
probabilistic_favicons[algo][mm3hash][sha512] = (mimetype, b64_favicon)
|
||||
if not probabilistic_favicons[algo][mm3hash]:
|
||||
# remove entry if it has no favicon
|
||||
probabilistic_favicons[algo].pop(mm3hash)
|
||||
if not probabilistic_favicons[algo]:
|
||||
# remove entry if it has no hash
|
||||
probabilistic_favicons.pop(algo)
|
||||
return captures, (mimetype, b64_favicon, mmh3_shodan), probabilistic_favicons
|
||||
|
||||
|
||||
def get_hhh_investigator(hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
|
||||
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||
all_captures = dict(get_indexing(flask_login.current_user).get_http_headers_hashes_captures(hhh))
|
||||
if cached_captures := lookyloo.sorted_capture_cache([entry for entry in all_captures]):
|
||||
captures = []
|
||||
for cache in cached_captures:
|
||||
try:
|
||||
urlnode = lookyloo.get_urlnode_from_tree(cache.uuid, all_captures[cache.uuid])
|
||||
except Exception:
|
||||
# NOTE: print a logline
|
||||
# logger.warning(f'Cache for {cache.uuid} needs a rebuild: {e}.')
|
||||
lookyloo._captures_index.remove_pickle(cache.uuid)
|
||||
continue
|
||||
captures.append((cache.uuid, urlnode.hostnode_uuid, urlnode.name, cache.title))
|
||||
# get the headers and format them as they were in the response
|
||||
urlnode = lookyloo.get_urlnode_from_tree(cached_captures[0].uuid, all_captures[cached_captures[0].uuid])
|
||||
headers = [(header["name"], header["value"]) for header in urlnode.response['headers']]
|
||||
return captures, headers
|
||||
return [], []
|
||||
|
||||
|
||||
def hash_lookup(blob_hash: str, url: str, capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]:
|
||||
'''Search all the captures a specific hash was seen.
|
||||
If a URL is given, it splits the results if the hash is seen on the same URL or an other one.
|
||||
Capture UUID avoids duplicates on the same capture'''
|
||||
captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
|
||||
total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1,
|
||||
prefered_uuids=set(lookyloo._captures_index.keys()))
|
||||
for h_capture_uuid, url_uuid, url_hostname, same_url, url in details:
|
||||
cache = lookyloo.capture_cache(h_capture_uuid)
|
||||
if cache and hasattr(cache, 'title'):
|
||||
if same_url:
|
||||
captures_list['same_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
|
||||
else:
|
||||
captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
|
||||
# Sort by timestamp by default
|
||||
captures_list['same_url'].sort(key=lambda y: y[3])
|
||||
captures_list['different_url'].sort(key=lambda y: y[3])
|
||||
return total_captures, captures_list
|
||||
|
||||
|
||||
def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[HostNode, list[dict[str, Any]]]:
|
||||
'''Gather all the informations needed to display the Hostnode investigator popup.'''
|
||||
|
||||
def normalize_known_content(h: str, /, known_content: dict[str, Any], url: URLNode) -> tuple[str | list[Any] | None, tuple[bool, Any] | None]:
|
||||
''' There are a few different sources to figure out known vs. legitimate content,
|
||||
this method normalizes it for the web interface.'''
|
||||
known: str | list[Any] | None = None
|
||||
legitimate: tuple[bool, Any] | None = None
|
||||
if h not in known_content:
|
||||
return known, legitimate
|
||||
|
||||
if known_content[h]['type'] in ['generic', 'sanejs']:
|
||||
known = known_content[h]['details']
|
||||
elif known_content[h]['type'] == 'legitimate_on_domain':
|
||||
legit = False
|
||||
if url.hostname in known_content[h]['details']:
|
||||
legit = True
|
||||
legitimate = (legit, known_content[h]['details'])
|
||||
elif known_content[h]['type'] == 'malicious':
|
||||
legitimate = (False, known_content[h]['details'])
|
||||
|
||||
return known, legitimate
|
||||
|
||||
ct = lookyloo.get_crawled_tree(capture_uuid)
|
||||
hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
|
||||
|
||||
known_content = lookyloo.context.find_known_content(hostnode)
|
||||
lookyloo.uwhois.query_whois_hostnode(hostnode)
|
||||
|
||||
urls: list[dict[str, Any]] = []
|
||||
for url in hostnode.urls:
|
||||
# For the popup, we need:
|
||||
# * https vs http
|
||||
# * everything after the domain
|
||||
# * the full URL
|
||||
to_append: dict[str, Any] = {
|
||||
'encrypted': url.name.startswith('https'),
|
||||
'url_path': url.name.split('/', 3)[-1],
|
||||
'url_object': url,
|
||||
}
|
||||
|
||||
if not url.empty_response:
|
||||
# Index lookup
|
||||
# %%% Full body %%%
|
||||
freq = get_indexing(flask_login.current_user).body_hash_fequency(url.body_hash)
|
||||
to_append['body_hash_details'] = freq
|
||||
if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1:
|
||||
to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid)
|
||||
|
||||
# %%% Embedded ressources %%%
|
||||
if hasattr(url, 'embedded_ressources') and url.embedded_ressources:
|
||||
to_append['embedded_ressources'] = {}
|
||||
for mimetype, blobs in url.embedded_ressources.items():
|
||||
for h, blob in blobs:
|
||||
if h in to_append['embedded_ressources']:
|
||||
# Skip duplicates
|
||||
continue
|
||||
freq_embedded = get_indexing(flask_login.current_user).body_hash_fequency(h)
|
||||
to_append['embedded_ressources'][h] = freq_embedded
|
||||
to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes
|
||||
to_append['embedded_ressources'][h]['type'] = mimetype
|
||||
if freq_embedded['hash_freq'] > 1:
|
||||
to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid)
|
||||
for h in to_append['embedded_ressources'].keys():
|
||||
known, legitimate = normalize_known_content(h, known_content, url)
|
||||
if known:
|
||||
to_append['embedded_ressources'][h]['known_content'] = known
|
||||
elif legitimate:
|
||||
to_append['embedded_ressources'][h]['legitimacy'] = legitimate
|
||||
|
||||
known, legitimate = normalize_known_content(url.body_hash, known_content, url)
|
||||
if known:
|
||||
to_append['known_content'] = known
|
||||
elif legitimate:
|
||||
to_append['legitimacy'] = legitimate
|
||||
|
||||
# Optional: Cookies sent to server in request -> map to nodes who set the cookie in response
|
||||
if hasattr(url, 'cookies_sent'):
|
||||
to_display_sent: dict[str, set[Iterable[str | None]]] = defaultdict(set)
|
||||
for cookie, contexts in url.cookies_sent.items():
|
||||
if not contexts:
|
||||
# Locally created?
|
||||
to_display_sent[cookie].add(('Unknown origin', ))
|
||||
continue
|
||||
for context in contexts:
|
||||
to_display_sent[cookie].add((context['setter'].hostname, context['setter'].hostnode_uuid))
|
||||
to_append['cookies_sent'] = to_display_sent
|
||||
|
||||
# Optional: Cookies received from server in response -> map to nodes who send the cookie in request
|
||||
if hasattr(url, 'cookies_received'):
|
||||
to_display_received: dict[str, dict[str, set[Iterable[str | None]]]] = {'3rd_party': defaultdict(set), 'sent': defaultdict(set), 'not_sent': defaultdict(set)}
|
||||
for domain, c_received, is_3rd_party in url.cookies_received:
|
||||
if c_received not in ct.root_hartree.cookies_sent:
|
||||
# This cookie is never sent.
|
||||
if is_3rd_party:
|
||||
to_display_received['3rd_party'][c_received].add((domain, ))
|
||||
else:
|
||||
to_display_received['not_sent'][c_received].add((domain, ))
|
||||
continue
|
||||
|
||||
for url_node in ct.root_hartree.cookies_sent[c_received]:
|
||||
if is_3rd_party:
|
||||
to_display_received['3rd_party'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
|
||||
else:
|
||||
to_display_received['sent'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
|
||||
to_append['cookies_received'] = to_display_received
|
||||
|
||||
urls.append(to_append)
|
||||
return hostnode, urls
|
||||
|
||||
|
||||
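As a reading aid, one entry of the `urls` list assembled above roughly follows the shape below. This is a sketch derived from the code; only the first three keys are always present, the rest depend on the response and on what the indexes know.

# Sketch of one entry in the `urls` list returned by get_hostnode_investigator().
example_entry: dict[str, Any] = {
    'encrypted': True,              # the URL starts with https
    'url_path': 'path/after/domain',
    'url_object': None,             # the URLNode itself in the real data
    'body_hash_details': {},        # optional: frequency info, may grow an 'other_captures' key
    'embedded_ressources': {},      # optional: per-hash frequency, body_size, type, known content
    'known_content': None,          # optional: details from the generic/sanejs sources
    'legitimacy': None,             # optional: (bool, details) for (non-)legitimate content
    'cookies_sent': {},             # optional: cookie name -> set of (hostname, hostnode UUID)
    'cookies_received': {},         # optional: '3rd_party' / 'sent' / 'not_sent' buckets
}
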
# ##### Hostnode level methods #####

@app.route('/tree/<string:tree_uuid>/host/<string:node_uuid>/hashes', methods=['GET'])

@@ -283,7 +633,7 @@ def urls_hostnode(tree_uuid: str, node_uuid: str) -> Response:
@app.route('/tree/<string:tree_uuid>/host/<string:node_uuid>', methods=['GET'])
def hostnode_popup(tree_uuid: str, node_uuid: str) -> str | WerkzeugResponse | Response:
    try:
        hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
        hostnode, urls = get_hostnode_investigator(tree_uuid, node_uuid)
    except IndexError:
        return render_template('error.html', error_message='Sorry, this one is on us. The tree was rebuilt, please reload the tree and try again.')

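The routes in this changeset consistently swap the shared lookyloo.indexing instance for get_indexing(flask_login.current_user). The helper itself lives elsewhere in the changeset; a plausible sketch of what it does, assuming the full index is gated by an index_everything setting and only exposed to authenticated users, is:

# Plausible sketch only; the real helper ships elsewhere in this changeset.
from lookyloo import Indexing
from lookyloo.default import get_config


def get_indexing(user) -> Indexing:
    '''Return the index to query: the full index (every capture, public or not) for an
    authenticated user when full indexing is enabled, otherwise the public index.'''
    # Assumption: an "index_everything" toggle in the generic config enables the full index.
    if not get_config('generic', 'index_everything'):
        return Indexing()
    if not user or not user.is_authenticated:
        return Indexing()
    return Indexing(full_index=True)
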
@@ -850,8 +1200,8 @@ def tree_favicons(tree_uuid: str) -> str:
                continue
            mimetype = from_string(favicon, mime=True)
            favicon_sha512 = hashlib.sha512(favicon).hexdigest()
            frequency = lookyloo.indexing.favicon_frequency(favicon_sha512)
            number_captures = lookyloo.indexing.favicon_number_captures(favicon_sha512)
            frequency = get_indexing(flask_login.current_user).favicon_frequency(favicon_sha512)
            number_captures = get_indexing(flask_login.current_user).favicon_number_captures(favicon_sha512)
            b64_favicon = base64.b64encode(favicon).decode()
            mmh3_shodan = lookyloo.compute_mmh3_shodan(favicon)
            favicons.append((favicon_sha512, frequency, number_captures, mimetype, b64_favicon, mmh3_shodan))

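compute_mmh3_shodan() produces the favicon hash usable with Shodan's http.favicon.hash filter. A sketch of the usual recipe, assuming that is indeed what the method computes (MurmurHash3 over the newline-wrapped base64 encoding of the favicon bytes, not over the raw bytes):

import base64

import mmh3  # third-party package


def shodan_favicon_hash(favicon: bytes) -> int:
    # Shodan hashes the MIME-style base64 encoding (76-char lines, trailing newline),
    # so use encodebytes() rather than b64encode().
    return mmh3.hash(base64.encodebytes(favicon))
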
@@ -860,7 +1210,7 @@ def tree_favicons(tree_uuid: str) -> str:

@app.route('/tree/<string:tree_uuid>/body_hashes', methods=['GET'])
def tree_body_hashes(tree_uuid: str) -> str:
    body_hashes = lookyloo.get_all_body_hashes(tree_uuid)
    body_hashes = get_all_body_hashes(tree_uuid)
    return render_template('tree_body_hashes.html', tree_uuid=tree_uuid, body_hashes=body_hashes)

@@ -958,27 +1308,27 @@ def index_hidden() -> str:

@app.route('/cookies', methods=['GET'])
def cookies_lookup() -> str:
    cookies_names = [(name, freq, lookyloo.indexing.cookies_names_number_domains(name))
                     for name, freq in lookyloo.indexing.cookies_names]
    cookies_names = [(name, freq, get_indexing(flask_login.current_user).cookies_names_number_domains(name))
                     for name, freq in get_indexing(flask_login.current_user).cookies_names]
    return render_template('cookies.html', cookies_names=cookies_names)


@app.route('/hhhashes', methods=['GET'])
def hhhashes_lookup() -> str:
    hhhashes = [(hhh, freq, lookyloo.indexing.http_headers_hashes_number_captures(hhh))
                for hhh, freq in lookyloo.indexing.http_headers_hashes]
    hhhashes = [(hhh, freq, get_indexing(flask_login.current_user).http_headers_hashes_number_captures(hhh))
                for hhh, freq in get_indexing(flask_login.current_user).http_headers_hashes]
    return render_template('hhhashes.html', hhhashes=hhhashes)


@app.route('/favicons', methods=['GET'])
def favicons_lookup() -> str:
    favicons = []
    for sha512, freq in lookyloo.indexing.favicons:
        favicon = lookyloo.indexing.get_favicon(sha512)
    for sha512, freq in get_indexing(flask_login.current_user).favicons:
        favicon = get_indexing(flask_login.current_user).get_favicon(sha512)
        if not favicon:
            continue
        favicon_b64 = base64.b64encode(favicon).decode()
        nb_captures = lookyloo.indexing.favicon_number_captures(sha512)
        nb_captures = get_indexing(flask_login.current_user).favicon_number_captures(sha512)
        favicons.append((sha512, freq, nb_captures, favicon_b64))
    return render_template('favicons.html', favicons=favicons)

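These listing pages only need (key, frequency) pairs plus a per-key counter, which maps naturally onto sorted sets. As a rough illustration of the idea (not the Indexing class from this changeset, whose storage details are not shown here), such an index could be kept in Redis like this:

from redis import Redis


class FaviconIndexSketch:
    '''Illustrative only: frequency index over favicon hashes backed by Redis sorted sets.'''

    def __init__(self) -> None:
        self.redis = Redis(host='127.0.0.1', port=6379, decode_responses=True)

    def index_favicon(self, favicon_sha512: str, capture_uuid: str) -> None:
        # Bump the global frequency and remember which captures contained the favicon.
        self.redis.zincrby('favicons', 1, favicon_sha512)
        self.redis.sadd(f'favicons|{favicon_sha512}|captures', capture_uuid)

    def favicons(self) -> list[tuple[str, float]]:
        # Highest frequency first, mirroring what the /favicons page iterates over.
        return self.redis.zrevrange('favicons', 0, -1, withscores=True)

    def favicon_number_captures(self, favicon_sha512: str) -> int:
        return self.redis.scard(f'favicons|{favicon_sha512}|captures')
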
@@ -986,10 +1336,10 @@ def favicons_lookup() -> str:

@app.route('/ressources', methods=['GET'])
def ressources() -> str:
    ressources = []
    for h, freq in lookyloo.indexing.ressources:
        domain_freq = lookyloo.indexing.ressources_number_domains(h)
    for h, freq in get_indexing(flask_login.current_user).ressources:
        domain_freq = get_indexing(flask_login.current_user).ressources_number_domains(h)
        context = lookyloo.context.find_known_content(h)
        capture_uuid, url_uuid, hostnode_uuid = lookyloo.indexing.get_hash_uuids(h)
        capture_uuid, url_uuid, hostnode_uuid = get_indexing(flask_login.current_user).get_hash_uuids(h)
        try:
            ressource = lookyloo.get_ressource(capture_uuid, url_uuid, h)
        except MissingUUID:

@@ -1003,7 +1353,7 @@ def ressources() -> str:

@app.route('/categories', methods=['GET'])
def categories() -> str:
    return render_template('categories.html', categories=lookyloo.indexing.categories)
    return render_template('categories.html', categories=get_indexing(flask_login.current_user).categories)


@app.route('/rebuild_all')

@@ -1057,7 +1407,7 @@ def recapture(tree_uuid: str) -> str | Response | WerkzeugResponse:
@app.route('/ressource_by_hash/<string:sha512>', methods=['GET'])
@file_response  # type: ignore[misc]
def ressource_by_hash(sha512: str) -> Response:
    details, body = lookyloo.get_body_hash_full(sha512)
    details, body = get_body_hash_full(sha512)
    return send_file(body, as_attachment=True, download_name='ressource.bin')

@@ -1245,13 +1595,13 @@ def capture_web() -> str | Response | WerkzeugResponse:

@app.route('/cookies/<string:cookie_name>', methods=['GET'])
def cookies_name_detail(cookie_name: str) -> str:
    captures, domains = lookyloo.get_cookie_name_investigator(cookie_name.strip())
    captures, domains = get_cookie_name_investigator(cookie_name.strip())
    return render_template('cookie_name.html', cookie_name=cookie_name, domains=domains, captures=captures)


@app.route('/hhhdetails/<string:hhh>', methods=['GET'])
def hhh_detail(hhh: str) -> str:
    captures, headers = lookyloo.get_hhh_investigator(hhh.strip())
    captures, headers = get_hhh_investigator(hhh.strip())
    return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)

@@ -1259,7 +1609,7 @@ def hhh_detail(hhh: str) -> str:

@app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:
    _get_prob = bool(get_probabilistic)
    captures, favicon, probabilistic_favicons = lookyloo.get_favicon_investigator(favicon_sha512.strip(), get_probabilistic=_get_prob)
    captures, favicon, probabilistic_favicons = get_favicon_investigator(favicon_sha512.strip(), get_probabilistic=_get_prob)
    mimetype, b64_favicon, mmh3_shodan = favicon
    return render_template('favicon_details.html', favicon_sha512=favicon_sha512,
                           captures=captures, mimetype=mimetype, b64_favicon=b64_favicon, mmh3_shodan=mmh3_shodan,

@@ -1269,20 +1619,20 @@ def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:

@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
def body_hash_details(body_hash: str) -> str:
    from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False
    captures, domains = lookyloo.get_body_hash_investigator(body_hash.strip())
    captures, domains = _get_body_hash_investigator(body_hash.strip())
    return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures, from_popup=from_popup)

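Minor style note on body_hash_details() above: the `True if ... else False` wrapper around the from_popup check is redundant, since the equality test already yields a boolean. An equivalent, shorter parse would be:

# Equivalent parse of the optional ?from_popup=True query parameter.
from_popup = request.args.get('from_popup') == 'True'
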
|
||||
@app.route('/urls/<string:url>', methods=['GET'])
|
||||
def url_details(url: str) -> str:
|
||||
url = unquote_plus(url).strip()
|
||||
hits = lookyloo.get_url_occurrences(url, limit=50)
|
||||
hits = get_url_occurrences(url, limit=50)
|
||||
return render_template('url.html', url=url, hits=hits)
|
||||
|
||||
|
||||
@app.route('/hostnames/<string:hostname>', methods=['GET'])
|
||||
def hostname_details(hostname: str) -> str:
|
||||
hits = lookyloo.get_hostname_occurrences(hostname.strip(), with_urls_occurrences=True, limit=50)
|
||||
hits = get_hostname_occurrences(hostname.strip(), with_urls_occurrences=True, limit=50)
|
||||
return render_template('hostname.html', hostname=hostname, hits=hits)
|
||||
|
||||
|
||||
|
|
|
@@ -287,7 +287,9 @@ class TriggerModules(Resource):  # type: ignore[misc]
         params={'h': 'The hash (sha512)'})
class HashInfo(Resource):  # type: ignore[misc]
    def get(self, h: str) -> dict[str, Any] | tuple[dict[str, Any], int]:
        details, body = lookyloo.get_body_hash_full(h)
        from . import get_body_hash_full

        details, body = get_body_hash_full(h)
        if not details:
            return {'error': 'Unknown Hash.'}, 400
        to_return: dict[str, Any] = {'response': {'hash': h, 'details': details,

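The function-level `from . import get_body_hash_full` (and the similar imports below) is presumably there to break a circular dependency: the web package imports this API module at startup, so the package-level helpers cannot be imported at module level yet. A minimal illustration of the pattern, with made-up module names:

# pkg/api.py  (hypothetical modules, for illustration only)
def handler() -> str:
    # Deferred import: pkg/__init__.py imports pkg.api while it is still initialising,
    # so a module-level "from . import helper" would fail. Importing inside the function
    # delays the lookup until the package is fully loaded.
    from . import helper
    return helper()
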
@@ -308,8 +310,9 @@ class URLInfo(Resource):  # type: ignore[misc]

    @api.doc(body=url_info_fields)  # type: ignore[misc]
    def post(self) -> list[dict[str, Any]]:
        from . import get_url_occurrences
        to_query: dict[str, Any] = request.get_json(force=True)
        occurrences = lookyloo.get_url_occurrences(to_query.pop('url'), **to_query)
        occurrences = get_url_occurrences(to_query.pop('url'), **to_query)
        return occurrences

@@ -326,8 +329,9 @@ class HostnameInfo(Resource):  # type: ignore[misc]

    @api.doc(body=hostname_info_fields)  # type: ignore[misc]
    def post(self) -> list[dict[str, Any]]:
        from . import get_hostname_occurrences
        to_query: dict[str, Any] = request.get_json(force=True)
        return lookyloo.get_hostname_occurrences(to_query.pop('hostname'), **to_query)
        return get_hostname_occurrences(to_query.pop('hostname'), **to_query)


@api.route('/json/stats')