new: Indexer for *all* the captures

pull/893/head
Raphaël Vinot 2024-03-05 20:51:21 +01:00
parent 2bbd35c0b8
commit e45b7c4346
14 changed files with 1450 additions and 568 deletions


@@ -5,15 +5,10 @@ from __future__ import annotations
import logging
import logging.config
import os
import shutil
from datetime import datetime, timedelta
from pathlib import Path
from lookyloo import Lookyloo
from lookyloo import Lookyloo, Indexing
from lookyloo.default import AbstractManager, get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list
from lookyloo.exceptions import NoValidHarFile
logging.config.dictConfig(get_config('logging'))
@@ -21,125 +16,39 @@ logging.config.dictConfig(get_config('logging'))
class BackgroundIndexer(AbstractManager):
def __init__(self, loglevel: int | None=None):
def __init__(self, full: bool=False, loglevel: int | None=None):
super().__init__(loglevel)
self.lookyloo = Lookyloo()
self.script_name = 'background_indexer'
self.full_indexer = full
self.indexing = Indexing(full_index=self.full_indexer)
if self.full_indexer:
self.script_name = 'background_full_indexer'
else:
self.script_name = 'background_indexer'
# make sure discarded captures dir exists
self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
def _to_run_forever(self) -> None:
all_done = self._build_missing_pickles()
if all_done:
self._check_indexes()
# Disable probabilistic indexing for now, mmh3 isn't a fuzzy hash algo.
# self._check_probabilistic_indexes()
self._check_indexes()
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
def _build_missing_pickles(self) -> bool:
self.logger.debug('Build missing pickles...')
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
# This value makes sure we break out of the loop and build pickles of the most recent captures
max_captures = 50
got_new_captures = False
# Initialize time where we do not want to build the pickles anymore.
archive_interval = timedelta(days=get_config('generic', 'archive'))
cut_time = (datetime.now() - archive_interval)
for month_dir in make_dirs_list(self.lookyloo.capture_dir):
__counter_shutdown = 0
for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
__counter_shutdown += 1
if __counter_shutdown % 10 and self.shutdown_requested():
self.logger.warning('Shutdown requested, breaking.')
return False
if ((path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists()):
# We already have a pickle file
self.logger.debug(f'{path} has a pickle.')
continue
if not list(path.rglob('*.har.gz')) and not list(path.rglob('*.har')):
# No HAR file
self.logger.debug(f'{path} has no HAR file.')
continue
if is_locked(path):
# it is really locked
self.logger.debug(f'{path} is locked, pickle generated by another process.')
continue
with (path / 'uuid').open() as f:
uuid = f.read()
if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
else:
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type]
if cached_path != path:
# we have a duplicate UUID, it is probably related to some bad copy/paste
if cached_path.exists():
# Both paths exist, move the one that isn't in lookup_dirs
self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {path}, discarding the latest')
try:
shutil.move(str(path), str(self.discarded_captures_dir / path.name))
except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}')
continue
else:
# The path in lookup_dirs for that UUID doesn't exist, just update it.
self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
try:
self.logger.info(f'Build pickle for {uuid}: {path.name}')
self.lookyloo.get_crawled_tree(uuid)
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
self.logger.info(f'Pickle for {uuid} built.')
got_new_captures = True
max_captures -= 1
except MissingUUID:
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
except NoValidHarFile as e:
self.logger.critical(f'There are no HAR files in the capture {uuid}: {path.name} - {e}')
except FileNotFoundError:
self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
except Exception:
self.logger.exception(f'Unable to build pickle for {uuid}: {path.name}')
# The capture is not working, moving it away.
try:
shutil.move(str(path), str(self.discarded_captures_dir / path.name))
self.lookyloo.redis.hdel('lookup_dirs', uuid)
except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}')
continue
if max_captures <= 0:
self.logger.info('Too many captures in the backlog, start from the beginning.')
return False
if got_new_captures:
self.logger.info('Finished building all missing pickles.')
# Only return True if we built new pickles.
return True
return False
def _check_indexes(self) -> None:
index_redis = self.lookyloo.indexing.redis
can_index = index_redis.set('ongoing_indexing', 1, ex=3600, nx=True)
if not can_index:
if not self.indexing.can_index:
# There is no reason to run this method in multiple scripts.
self.logger.info('Indexing already ongoing in another process.')
return None
self.logger.info('Check indexes...')
self.logger.info(f'Check {self.script_name}...')
for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
if self.lookyloo.is_public_instance and cache.no_index:
# Capture unindexed
if not self.full_indexer:
# If we're not running the full indexer, check if the capture should be indexed.
if self.lookyloo.is_public_instance and cache.no_index:
# Capture unindexed
continue
if not cache.tree_ready:
# pickle isn't ready, we can't index.
continue
p = index_redis.pipeline()
p.sismember('indexed_urls', cache.uuid)
p.sismember('indexed_body_hashes', cache.uuid)
p.sismember('indexed_cookies', cache.uuid)
p.sismember('indexed_hhhashes', cache.uuid)
p.sismember('indexed_favicons', cache.uuid)
indexed = p.execute()
indexed = self.indexing.capture_indexed(cache.uuid)
if all(indexed):
continue
try:
@@ -151,50 +60,23 @@ class BackgroundIndexer(AbstractManager):
if not indexed[0]:
self.logger.info(f'Indexing urls for {cache.uuid}')
self.lookyloo.indexing.index_url_capture(ct)
self.indexing.index_url_capture(ct)
if not indexed[1]:
self.logger.info(f'Indexing resources for {cache.uuid}')
self.lookyloo.indexing.index_body_hashes_capture(ct)
self.indexing.index_body_hashes_capture(ct)
if not indexed[2]:
self.logger.info(f'Indexing cookies for {cache.uuid}')
self.lookyloo.indexing.index_cookies_capture(ct)
self.indexing.index_cookies_capture(ct)
if not indexed[3]:
self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
self.indexing.index_http_headers_hashes_capture(ct)
if not indexed[4]:
self.logger.info(f'Indexing favicons for {cache.uuid}')
favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
self.lookyloo.indexing.index_favicons_capture(cache.uuid, favicons)
self.indexing.index_favicons_capture(cache.uuid, favicons)
# NOTE: categories aren't taken into account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories)
index_redis.delete('ongoing_indexing')
self.logger.info('... done.')
def _check_probabilistic_indexes(self) -> None:
index_redis = self.lookyloo.indexing.redis
can_index = index_redis.set('ongoing_probalistic_indexing', 1, ex=3600, nx=True)
if not can_index:
# There is no reason to run this method in multiple scripts.
self.logger.info('Probabilistic indexing already ongoing in another process.')
return None
self.logger.info('Check probabilistic indexes...')
algorithms = ['mmh3-shodan']
for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
if self.lookyloo.is_public_instance and cache.no_index:
# Capture unindexed
continue
p = index_redis.pipeline()
for algorithm in algorithms:
p.sismember(f'indexed_favicons_probabilistic|{algorithm}', cache.uuid)
indexed = p.execute()
if all(indexed):
continue
for i, algorithm in enumerate(algorithms):
if not indexed[i]:
self.logger.info(f'Probabilistic indexing favicons for {cache.uuid} with {algorithm}')
favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
self.lookyloo.indexing.index_favicons_probabilistic(cache.uuid, favicons, algorithm)
index_redis.delete('ongoing_probalistic_indexing')
self.indexing.indexing_done()
self.logger.info('... done.')
@@ -203,5 +85,12 @@ def main() -> None:
i.run(sleep_in_sec=60)
def main_full_indexer() -> None:
if not get_config('generic', 'index_everything'):
raise Exception('Full indexer is disabled.')
i = BackgroundIndexer(full=True)
i.run(sleep_in_sec=60)
if __name__ == '__main__':
main()
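A minimal sketch (not part of the commit above) of the Redis lock pattern behind Indexing.can_index / Indexing.indexing_done: the first process to SET the key with NX holds the lock, and the EX=3600 expiry releases it automatically if an indexer dies mid-run. The socket path below is illustrative.

from redis import Redis

r = Redis(unix_socket_path='indexing.sock', decode_responses=True)  # illustrative path

if r.set('ongoing_indexing', 1, ex=3600, nx=True):
    try:
        ...  # index the captures here
    finally:
        r.delete('ongoing_indexing')
else:
    print('Indexing already ongoing in another process.')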


@@ -11,7 +11,7 @@ from subprocess import Popen
from redis import Redis
from redis.exceptions import ConnectionError
from lookyloo.default import get_homedir, get_socket_path
from lookyloo.default import get_homedir, get_socket_path, get_config
def check_running(name: str) -> bool:
@@ -55,13 +55,32 @@ def shutdown_indexing(storage_directory: Path | None=None) -> None:
print('Redis indexing database shutdown.')
def launch_full_index(storage_directory: Path | None=None) -> None:
if not storage_directory:
storage_directory = get_homedir()
if not check_running('full_index'):
Popen(["./run_kvrocks.sh"], cwd=(storage_directory / 'full_index'))
def shutdown_full_index(storage_directory: Path | None=None) -> None:
if not storage_directory:
storage_directory = get_homedir()
r = Redis(unix_socket_path=get_socket_path('full_index'))
r.shutdown(save=True)
print('Kvrocks full indexing database shutdown.')
def launch_all() -> None:
launch_cache()
launch_indexing()
if get_config('generic', 'index_everything'):
launch_full_index()
def check_all(stop: bool=False) -> None:
backends: dict[str, bool] = {'cache': False, 'indexing': False}
if get_config('generic', 'index_everything'):
backends['full_index'] = False
while True:
for db_name in backends.keys():
try:
@@ -85,6 +104,8 @@ def check_all(stop: bool=False) -> None:
def stop_all() -> None:
shutdown_cache()
shutdown_indexing()
if get_config('generic', 'index_everything'):
shutdown_full_index()
def main() -> None:


@@ -2,7 +2,7 @@
from subprocess import Popen, run
from lookyloo.default import get_homedir
from lookyloo.default import get_homedir, get_config
def main() -> None:
@@ -18,9 +18,16 @@ def main() -> None:
print('Start asynchronous ingestor...')
Popen(['async_capture'])
print('done.')
print('Start background capture builder...')
Popen(['background_build_captures'])
print('done.')
print('Start background indexer...')
Popen(['background_indexer'])
print('done.')
if get_config('generic', 'index_everything'):
print('Start background full indexer...')
Popen(['background_full_indexer'])
print('done.')
print('Start background processing...')
Popen(['processing'])
print('done.')


@@ -79,6 +79,7 @@
"bucket_name": ""
}
},
"index_everything": false,
"_notes": {
"loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
"only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network",
@@ -110,6 +111,7 @@
"archive": "The captures older than this value (in days) will be archived. They're not cached by default in the Lookyloo class.",
"max_capture_time": "The very maximal time we allow a capture to keep going. Should only be triggered by captures that cause playwright to never quit.",
"max_tree_create_time": "The max time the generation of a tree is allowed to take",
"s3fs": "The config to access a S3FS instance with the s3fs python module - it is not integrated properly for now as it requires urllib < 2.0 which is a non-started at this stage."
"s3fs": "The config to access a S3FS instance with the s3fs python module - it is not integrated properly for now as it requires urllib < 2.0 which is a non-started at this stage.",
"index_everything": "If true, index every capture, even if it's not public. This feature requires a dedicated kvrocks instance, and is only accessible when logged-in as admin."
}
}
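As a hedged illustration (not part of the diff) of how the new flag is meant to be consumed: the regular index always exists, while the full index is optional and sits behind the dedicated kvrocks 'full_index' socket.

from lookyloo import Indexing
from lookyloo.default import get_config

indexing = Indexing()  # regular index, backed by the 'indexing' socket
if get_config('generic', 'index_everything'):
    # only available when the dedicated kvrocks instance is running
    full_indexing = Indexing(full_index=True)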

full_index/kvrocks.conf Normal file

@@ -0,0 +1,875 @@
################################ GENERAL #####################################
# By default kvrocks listens for connections from localhost interface.
# It is possible to listen to just one or multiple interfaces using
# the "bind" configuration directive, followed by one or more IP addresses.
#
# Examples:
#
# bind 192.168.1.100 10.0.0.1
# bind 127.0.0.1 ::1
# bind 0.0.0.0
# bind 127.0.0.1
# Unix socket.
#
# Specify the path for the unix socket that will be used to listen for
# incoming connections. There is no default, so kvrocks will not listen
# on a unix socket when not specified.
#
unixsocket full_index.sock
unixsocketperm 777
# Accept connections on the specified port, default is 6666.
# port 6666
# Close the connection after a client is idle for N seconds (0 to disable)
timeout 0
# The number of worker threads; increasing or decreasing it affects performance.
workers 8
# By default, kvrocks does not run as a daemon. Use 'yes' if you need it.
# Note that kvrocks will write a PID file in /var/run/kvrocks.pid when daemonized
daemonize yes
# Kvrocks implements the cluster solution that is similar to the Redis cluster solution.
# You can get cluster information by CLUSTER NODES|SLOTS|INFO command, it also is
# adapted to redis-cli, redis-benchmark, Redis cluster SDK, and Redis cluster proxy.
# But kvrocks doesn't support communicating with each other, so you must set
# cluster topology by CLUSTER SETNODES|SETNODEID commands, more details: #219.
#
# PLEASE NOTE:
# If you enable cluster, kvrocks will encode key with its slot id calculated by
# CRC16 and modulo 16384, encoding key with its slot id makes it efficient to
# migrate keys based on the slot. So if you enabled at first time, cluster mode must
# not be disabled after restarting, and vice versa. That is to say, data is not
# compatible between standalone mode with cluster mode, you must migrate data
# if you want to change mode, otherwise, kvrocks will make data corrupt.
#
# Default: no
cluster-enabled no
# By default, namespaces are stored in the configuration file and won't be replicated
# to replicas. This option allows changing this behavior, so that namespaces are also
# propagated to slaves. Note that:
# 1) it won't replicate the 'masterauth' to prevent breaking master/replica replication
# 2) it will overwrite the replica's namespaces with the master's namespaces, so be careful with in-use namespaces
# 3) namespace replication cannot be switched off once it's enabled
#
# Default: no
repl-namespace-enabled no
# Persist the cluster nodes topology in local file($dir/nodes.conf). This configuration
# takes effect only if the cluster mode was enabled.
#
# If yes, it will try to load the cluster topology from the local file when starting,
# and dump the cluster nodes into the file if it was changed.
#
# Default: yes
persist-cluster-nodes-enabled yes
# Set the max number of connected clients at the same time. By default
# this limit is set to 10000 clients. However, if the server is not
# able to configure the process file limit to allow for the specified limit
# the max number of allowed clients is set to the current file limit
#
# Once the limit is reached the server will close all the new connections sending
# an error 'max number of clients reached'.
#
maxclients 10000
# Require clients to issue AUTH <PASSWORD> before processing any other
# commands. This might be useful in environments in which you do not trust
# others with access to the host running kvrocks.
#
# This should stay commented out for backward compatibility and because most
# people do not need auth (e.g. they run their own servers).
#
# Warning: since kvrocks is pretty fast an outside user can try up to
# 150k passwords per second against a good box. This means that you should
# use a very strong password otherwise it will be very easy to break.
#
# requirepass foobared
# If the master is password protected (using the "masterauth" configuration
# directive below) it is possible to tell the slave to authenticate before
# starting the replication synchronization process. Otherwise, the master will
# refuse the slave request.
#
# masterauth foobared
# Master-Slave replication checks that the db name matches; if not, the slave will
# refuse to sync the db from the master. Don't use the default value; set the db-name to identify
# the cluster.
db-name change.me.db
# The working directory
#
# The DB will be written inside this directory
# Note that you must specify a directory here, not a file name.
dir ./
# You can configure where to store your server logs by the log-dir.
# If you don't specify one, we will use the above `dir` as our default log directory.
# Sending logs to stdout/stderr is also as simple as:
#
log-dir stdout
# Log level
# Possible values: info, warning, error, fatal
# Default: info
log-level info
# You can configure log-retention-days to control whether to enable the log cleaner
# and the maximum retention days that the INFO level logs will be kept.
#
# if set to -1, that means to disable the log cleaner.
# if set to 0, all previous INFO level logs will be immediately removed.
# if set to a value between 0 and INT_MAX, it will retain the latest N (log-retention-days) days of logs.
# By default the log-retention-days is -1.
log-retention-days -1
# When running in daemonize mode, kvrocks writes a PID file in ${CONFIG_DIR}/kvrocks.pid by
# default. You can specify a custom pid file location here.
pidfile kvrocks.pid
# You can configure a slave instance to accept writes or not. Writing against
# a slave instance may be useful to store some ephemeral data (because data
# written on a slave will be easily deleted after resync with the master) but
# may also cause problems if clients are writing to it because of a
# misconfiguration.
slave-read-only yes
# The slave priority is an integer number published by Kvrocks in the INFO output.
# It is used by Redis Sentinel in order to select a slave to promote into a
# master if the master is no longer working correctly.
#
# A slave with a low priority number is considered better for promotion, so
# for instance if there are three slaves with priority 10, 100, 25, Sentinel will
# pick the one with priority 10, that is the lowest.
#
# However a special priority of 0 marks the replica as not able to perform the
# role of master, so a slave with priority of 0 will never be selected by
# Redis Sentinel for promotion.
#
# By default the priority is 100.
slave-priority 100
# TCP listen() backlog.
#
# In high requests-per-second environments you need a high backlog in order
# to avoid slow client connection issues. Note that the Linux kernel
# will silently truncate it to the value of /proc/sys/net/core/somaxconn, so
# make sure to raise both the value of somaxconn and tcp_max_syn_backlog
# in order to get the desired effect.
tcp-backlog 511
# If the master is an old version, it may have specified replication threads
# that use 'port + 1' as the listening port, but new versions don't use an
# extra port to implement replication. In order to allow new replicas to
# copy from old masters, you should indicate whether or not the master uses a
# replication port.
# If yes, the master uses a replication port and replicas will connect
# to 'master's listening port + 1' during synchronization.
# If no, the master doesn't use a replication port and replicas will
# connect to the master's listening port during synchronization.
master-use-repl-port no
# Currently, the master only checks the sequence number when a replica asks for
# PSYNC. That is not enough, since they may have different replication histories
# even if the sequence the replica asks for is in the range of the master's current WAL.
#
# We designed 'Replication Sequence ID' PSYNC: we add a unique replication id to
# every write batch (the operations of each command on the storage engine), so
# the combination of replication id and sequence is unique per write batch.
# The master can identify whether the replica has the same replication history
# by checking the replication id and sequence.
#
# By default, it is not enabled since this stricter check may easily lead to
# full synchronization.
use-rsid-psync no
# Master-Slave replication. Use slaveof to make a kvrocks instance a copy of
# another kvrocks server. A few things to understand ASAP about kvrocks replication.
#
# 1) Kvrocks replication is asynchronous, but you can configure a master to
# stop accepting writes if it appears to be not connected with at least
# a given number of slaves.
# 2) Kvrocks slaves are able to perform a partial resynchronization with the
# master if the replication link is lost for a relatively small amount of
# time. You may want to configure the replication backlog size (see the next
# sections of this file) with a sensible value depending on your needs.
# 3) Replication is automatic and does not need user intervention. After a
# network partition slaves automatically try to reconnect to masters
# and resynchronize with them.
#
# slaveof <masterip> <masterport>
# slaveof 127.0.0.1 6379
# When a slave loses its connection with the master, or when the replication
# is still in progress, the slave can act in two different ways:
#
# 1) if slave-serve-stale-data is set to 'yes' (the default) the slave will
# still reply to client requests, possibly with out-of-date data, or the
# data set may just be empty if this is the first synchronization.
#
# 2) if slave-serve-stale-data is set to 'no' the slave will reply with
# an error "SYNC with master in progress" to all kinds of commands
# but to INFO and SLAVEOF.
#
slave-serve-stale-data yes
# To keep the slave's data safe and able to serve requests while it is in the full
# synchronization state, the slave keeps its existing data. But this occupies extra
# disk space, so we provide a way to reduce disk usage: the slave deletes its
# entire database before fetching files from the master during full synchronization.
# If you want to enable this behavior, you can set 'slave-empty-db-before-fullsync'
# to yes, but be aware that the database will be lost if the master goes down during
# full synchronization, unless you have a backup of the database.
#
# This option is similar to the Redis replica RDB diskless load option:
# repl-diskless-load on-empty-db
#
# Default: no
slave-empty-db-before-fullsync no
# A Kvrocks master is able to list the address and port of the attached
# replicas in different ways. For example the "INFO replication" section
# offers this information, which is used, among other tools, by
# Redis Sentinel in order to discover replica instances.
# Another place where this info is available is in the output of the
# "ROLE" command of a master.
#
# The listed IP address and port normally reported by a replica is
# obtained in the following way:
#
# IP: The address is auto detected by checking the peer address
# of the socket used by the replica to connect with the master.
#
# Port: The port is communicated by the replica during the replication
# handshake, and is normally the port that the replica is using to
# listen for connections.
#
# However when port forwarding or Network Address Translation (NAT) is
# used, the replica may actually be reachable via different IP and port
# pairs. The following two options can be used by a replica in order to
# report to its master a specific set of IP and port, so that both INFO
# and ROLE will report those values.
#
# There is no need to use both the options if you need to override just
# the port or the IP address.
#
# replica-announce-ip 5.5.5.5
# replica-announce-port 1234
# If replicas need a full synchronization with the master, the master needs to
# create a checkpoint for feeding the replicas, and replicas also stage a
# checkpoint of the master. If we also keep the backup, it may occupy extra disk
# space. You can enable 'purge-backup-on-fullsync' if disk space is insufficient,
# but that may cause the remote backup copy to fail.
#
# Default: no
purge-backup-on-fullsync no
# The maximum allowed rate (in MB/s) that should be used by replication.
# If the rate exceeds max-replication-mb, replication will slow down.
# Default: 0 (i.e. no limit)
max-replication-mb 0
# The maximum allowed aggregated write rate of flush and compaction (in MB/s).
# If the rate exceeds max-io-mb, io will slow down.
# 0 is no limit
# Default: 0
max-io-mb 0
# The maximum allowed space (in GB) that should be used by RocksDB.
# If the total size of the SST files exceeds max_allowed_space, writes to RocksDB will fail.
# Please see: https://github.com/facebook/rocksdb/wiki/Managing-Disk-Space-Utilization
# Default: 0 (i.e. no limit)
max-db-size 0
# The maximum number of backups to keep. The server cron runs every minute to check
# the number of current backups and purges the old ones if the maximum is exceeded.
# If max-backup-to-keep is 0, no backup is kept. For now, only 0 or 1 is supported.
max-backup-to-keep 1
# The maximum hours to keep the backup. If max-backup-keep-hours is 0, no backup will be purged.
# default: 1 day
max-backup-keep-hours 24
# max-bitmap-to-string-mb is used to limit the max size (MB) of the bitmap-to-string transformation.
#
# Default: 16
max-bitmap-to-string-mb 16
# Whether to enable SCAN-like cursor compatible with Redis.
# If enabled, the cursor will be unsigned 64-bit integers.
# If disabled, the cursor will be a string.
# Default: no
redis-cursor-compatible no
# Maximum nesting depth allowed when parsing and serializing
# JSON documents while using JSON commands like JSON.SET.
# Default: 1024
json-max-nesting-depth 1024
# The underlying storage format of JSON data type
# NOTE: This option only affects newly written/updated key-values
# The CBOR format may reduce the storage size and speed up JSON commands
# Available values: json, cbor
# Default: json
json-storage-format json
################################## TLS ###################################
# By default, TLS/SSL is disabled, i.e. `tls-port` is set to 0.
# To enable it, `tls-port` can be used to define TLS-listening ports.
# tls-port 0
# Configure a X.509 certificate and private key to use for authenticating the
# server to connected clients, masters or cluster peers.
# These files should be PEM formatted.
#
# tls-cert-file kvrocks.crt
# tls-key-file kvrocks.key
# If the key file is encrypted using a passphrase, it can be included here
# as well.
#
# tls-key-file-pass secret
# Configure a CA certificate(s) bundle or directory to authenticate TLS/SSL
# clients and peers. Kvrocks requires an explicit configuration of at least one
# of these, and will not implicitly use the system wide configuration.
#
# tls-ca-cert-file ca.crt
# tls-ca-cert-dir /etc/ssl/certs
# By default, clients on a TLS port are required
# to authenticate using valid client side certificates.
#
# If "no" is specified, client certificates are not required and not accepted.
# If "optional" is specified, client certificates are accepted and must be
# valid if provided, but are not required.
#
# tls-auth-clients no
# tls-auth-clients optional
# By default, only TLSv1.2 and TLSv1.3 are enabled and it is highly recommended
# that older formally deprecated versions are kept disabled to reduce the attack surface.
# You can explicitly specify TLS versions to support.
# Allowed values are case insensitive and include "TLSv1", "TLSv1.1", "TLSv1.2",
# "TLSv1.3" (OpenSSL >= 1.1.1) or any combination.
# To enable only TLSv1.2 and TLSv1.3, use:
#
# tls-protocols "TLSv1.2 TLSv1.3"
# Configure allowed ciphers. See the ciphers(1ssl) manpage for more information
# about the syntax of this string.
#
# Note: this configuration applies only to <= TLSv1.2.
#
# tls-ciphers DEFAULT:!MEDIUM
# Configure allowed TLSv1.3 ciphersuites. See the ciphers(1ssl) manpage for more
# information about the syntax of this string, and specifically for TLSv1.3
# ciphersuites.
#
# tls-ciphersuites TLS_CHACHA20_POLY1305_SHA256
# When choosing a cipher, use the server's preference instead of the client
# preference. By default, the server follows the client's preference.
#
# tls-prefer-server-ciphers yes
# By default, TLS session caching is enabled to allow faster and less expensive
# reconnections by clients that support it. Use the following directive to disable
# caching.
#
# tls-session-caching no
# Change the default number of TLS sessions cached. A zero value sets the cache
# to unlimited size. The default size is 20480.
#
# tls-session-cache-size 5000
# Change the default timeout of cached TLS sessions. The default timeout is 300
# seconds.
#
# tls-session-cache-timeout 60
# By default, a replica does not attempt to establish a TLS connection
# with its master.
#
# Use the following directive to enable TLS on replication links.
#
# tls-replication yes
################################## SLOW LOG ###################################
# The Kvrocks Slow Log is a mechanism to log queries that exceeded a specified
# execution time. The execution time does not include the I/O operations
# like talking with the client, sending the reply and so forth,
# but just the time needed to actually execute the command (this is the only
# stage of command execution where the thread is blocked and can not serve
# other requests in the meantime).
#
# You can configure the slow log with two parameters: one tells Kvrocks
# what is the execution time, in microseconds, to exceed in order for the
# command to get logged, and the other parameter is the length of the
# slow log. When a new command is logged the oldest one is removed from the
# queue of logged commands.
# The following time is expressed in microseconds, so 1000000 is equivalent
# to one second. Note that -1 value disables the slow log, while
# a value of zero forces the logging of every command.
slowlog-log-slower-than 100000
# There is no limit to this length. Just be aware that it will consume memory.
# You can reclaim memory used by the slow log with SLOWLOG RESET.
slowlog-max-len 128
# If you run kvrocks from upstart or systemd, kvrocks can interact with your
# supervision tree. Options:
# supervised no - no supervision interaction
# supervised upstart - signal upstart by putting kvrocks into SIGSTOP mode
# supervised systemd - signal systemd by writing READY=1 to $NOTIFY_SOCKET
# supervised auto - detect upstart or systemd method based on
# UPSTART_JOB or NOTIFY_SOCKET environment variables
# Note: these supervision methods only signal "process is ready."
# They do not enable continuous liveness pings back to your supervisor.
supervised no
################################## PERF LOG ###################################
# The Kvrocks Perf Log is a mechanism to log queries' performance context that
# exceeded a specified execution time. This mechanism uses rocksdb's
# Perf Context and IO Stats Context, Please see:
# https://github.com/facebook/rocksdb/wiki/Perf-Context-and-IO-Stats-Context
#
# This mechanism is enabled when profiling-sample-commands is not empty and
# profiling-sample-ratio is greater than 0.
# It is important to note that this mechanism affects performance, but it is
# useful for troubleshooting performance bottlenecks, so it should only be
# enabled when performance problems occur.
# The name of the commands you want to record. Must be original name of
# commands supported by Kvrocks. Use ',' to separate multiple commands and
# use '*' to record all commands supported by Kvrocks.
# Example:
# - Single command: profiling-sample-commands get
# - Multiple commands: profiling-sample-commands get,mget,hget
#
# Default: empty
# profiling-sample-commands ""
# Ratio of the samples that will be recorded. It is a number between 0 and 100.
# We simply use rand to determine whether to record the sample or not.
#
# Default: 0
profiling-sample-ratio 0
# There is no limit to this length. Just be aware that it will consume memory.
# You can reclaim memory used by the perf log with PERFLOG RESET.
#
# Default: 256
profiling-sample-record-max-len 256
# profiling-sample-record-threshold-ms is used to tell kvrocks when to record.
#
# Default: 100 millisecond
profiling-sample-record-threshold-ms 100
################################## CRON ###################################
# Compact Scheduler, auto compact at schedule time
# time expression format is the same as crontab(currently only support * and int)
# e.g. compact-cron 0 3 * * * 0 4 * * *
# would compact the db at 3am and 4am everyday
# compact-cron 0 3 * * *
# The hour range that compaction checker would be active
# e.g. compaction-checker-range 0-7 means the compaction checker will be working between
# 0-7am every day.
compaction-checker-range 0-7
# When the compaction checker is triggered, the db will periodically pick the SST file
# with the highest "deleted percentage" (i.e. the percentage of deleted keys in the SST
# file) to compact, in order to free disk space.
# However, if a specific SST file was created more than "force-compact-file-age" seconds
# ago, and its percentage of deleted keys is higher than
# "force-compact-file-min-deleted-percentage", it will be forcely compacted as well.
# Default: 172800 seconds; Range: [60, INT64_MAX];
# force-compact-file-age 172800
# Default: 10 %; Range: [1, 100];
# force-compact-file-min-deleted-percentage 10
# Bgsave scheduler, auto bgsave at scheduled time
# time expression format is the same as crontab(currently only support * and int)
# e.g. bgsave-cron 0 3 * * * 0 4 * * *
# would bgsave the db at 3am and 4am every day
# Command renaming.
#
# It is possible to change the name of dangerous commands in a shared
# environment. For instance, the KEYS command may be renamed into something
# hard to guess so that it will still be available for internal-use tools
# but not available for general clients.
#
# Example:
#
# rename-command KEYS b840fc02d524045429941cc15f59e41cb7be6c52
#
# It is also possible to completely kill a command by renaming it into
# an empty string:
#
# rename-command KEYS ""
################################ MIGRATE #####################################
# If the network bandwidth is completely consumed by the migration task,
# it will affect the availability of kvrocks. To avoid this situation,
# migrate-speed is adopted to limit the migrating speed.
# Migrating speed is limited by controlling the duration between sending data,
# the duration is calculated by: 1000000 * migrate-pipeline-size / migrate-speed (us).
# Value: [0,INT_MAX], 0 means no limit
#
# Default: 4096
migrate-speed 4096
# In order to reduce data transmission times and improve the efficiency of data migration,
# pipeline is adopted to send multiple data at once. Pipeline size can be set by this option.
# Value: [1, INT_MAX], it can't be 0
#
# Default: 16
migrate-pipeline-size 16
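# Illustrative worked example (not part of the upstream template): with the two
# defaults above, the pause between pipelines is roughly
# 1000000 * 16 / 4096 ≈ 3906 microseconds, i.e. about 3.9 ms per batch of 16 commands.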
# In order to reduce the time writes are forbidden while migrating a slot, we migrate the incremental
# data several times to reduce its amount. Writes to the slot are only forbidden once the quantity of
# incremental data is reduced to a certain threshold. The threshold is set by
# this option.
# Value: [1, INT_MAX], it can't be 0
#
# Default: 10000
migrate-sequence-gap 10000
################################ ROCKSDB #####################################
# Specify the capacity of column family block cache. A larger block cache
# may make requests faster while more keys would be cached. Max Size is 400*1024.
# Default: 4096MB
rocksdb.block_cache_size 4096
# A global cache for table-level rows in RocksDB. If almost always point
# lookups, enlarging row cache may improve read performance. Otherwise,
# if we enlarge this value, we can lessen metadata/subkey block cache size.
#
# Default: 0 (disabled)
rocksdb.row_cache_size 0
# Number of open files that can be used by the DB. You may need to
# increase this if your database has a large working set. Value -1 means
# files opened are always kept open. You can estimate number of files based
# on target_file_size_base and target_file_size_multiplier for level-based
# compaction. For universal-style compaction, you can usually set it to -1.
# Default: 8096
rocksdb.max_open_files 8096
# Amount of data to build up in memory (backed by an unsorted log
# on disk) before converting to a sorted on-disk file.
#
# Larger values increase performance, especially during bulk loads.
# Up to max_write_buffer_number write buffers may be held in memory
# at the same time,
# so you may wish to adjust this parameter to control memory usage.
# Also, a larger write buffer will result in a longer recovery time
# the next time the database is opened.
#
# Note that write_buffer_size is enforced per column family.
# See db_write_buffer_size for sharing memory across column families.
# default is 64MB
rocksdb.write_buffer_size 64
# Target file size for compaction, target file size for Level N can be calculated
# by target_file_size_base * (target_file_size_multiplier ^ (L-1))
#
# Default: 128MB
rocksdb.target_file_size_base 128
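# Illustrative worked example (not part of the upstream template): with the base
# of 128 MB above and RocksDB's default target_file_size_multiplier of 1, every
# level targets 128 MB per SST file; a hypothetical multiplier of 2 would give
# 128 * 2^(3-1) = 512 MB at level 3.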
# The maximum number of write buffers that are built up in memory.
# The default and the minimum number is 2, so that when 1 write buffer
# is being flushed to storage, new writes can continue to the other
# write buffer.
# If max_write_buffer_number > 3, writing will be slowed down to
# options.delayed_write_rate if we are writing to the last write buffer
# allowed.
rocksdb.max_write_buffer_number 4
# Maximum number of concurrent background jobs (compactions and flushes).
# For backwards compatibility we will set `max_background_jobs =
# max_background_compactions + max_background_flushes` in the case where user
# sets at least one of `max_background_compactions` or `max_background_flushes`
# (we replace -1 by 1 in case one option is unset).
rocksdb.max_background_jobs 4
# DEPRECATED: it is automatically decided based on the value of rocksdb.max_background_jobs
# Maximum number of concurrent background compaction jobs, submitted to
# the default LOW priority thread pool.
rocksdb.max_background_compactions -1
# DEPRECATED: it is automatically decided based on the value of rocksdb.max_background_jobs
# Maximum number of concurrent background memtable flush jobs, submitted by
# default to the HIGH priority thread pool. If the HIGH priority thread pool
# is configured to have zero threads, flush jobs will share the LOW priority
# thread pool with compaction jobs.
rocksdb.max_background_flushes -1
# This value represents the maximum number of threads that will
# concurrently perform a compaction job by breaking it into multiple,
# smaller ones that are run simultaneously.
# Default: 2
rocksdb.max_sub_compactions 2
# In order to limit the size of WALs, RocksDB uses DBOptions::max_total_wal_size
# as the trigger of column family flush. Once WALs exceed this size, RocksDB
# will start forcing the flush of column families to allow deletion of some
# oldest WALs. This config can be useful when column families are updated at
# non-uniform frequencies. If there's no size limit, users may need to keep
# really old WALs when the infrequently-updated column families hasn't flushed
# for a while.
#
# In kvrocks, we use multiple column families to store metadata, subkeys, etc.
# If users always use string type, but use list, hash and other complex data types
# infrequently, there will be a lot of old WALs if we don't set size limit
# (0 by default in rocksdb), because rocksdb will dynamically choose the WAL size
# limit to be [sum of all write_buffer_size * max_write_buffer_number] * 4 if set to 0.
#
# Moreover, you should increase this value if you already set rocksdb.write_buffer_size
# to a big value, to avoid influencing the effect of rocksdb.write_buffer_size and
# rocksdb.max_write_buffer_number.
#
# default is 512MB
rocksdb.max_total_wal_size 512
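# Illustrative worked example (not part of the upstream template): per the formula
# above, a single column family with write_buffer_size 64 MB and
# max_write_buffer_number 4 would already allow 64 * 4 * 4 = 1024 MB of WAL if this
# option were left at 0, hence the explicit 512 MB cap.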
# We implement replication with the RocksDB WAL; a full sync is triggered when the requested seq is out of range.
# wal_ttl_seconds and wal_size_limit_mb affect how archived logs are deleted.
# If WAL_ttl_seconds is not 0, then WAL files will be checked every WAL_ttl_seconds / 2 and those that
# are older than WAL_ttl_seconds will be deleted.
#
# Default: 3 Hours
rocksdb.wal_ttl_seconds 10800
# If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
# WAL files will be checked every 10 min and if the total size is greater
# than WAL_size_limit_MB, they will be deleted starting with the
# earliest until the size limit is met. All empty files will be deleted.
# Default: 16GB
rocksdb.wal_size_limit_mb 16384
# Approximate size of user data packed per block. Note that the
# block size specified here corresponds to uncompressed data. The
# actual size of the unit read from disk may be smaller if
# compression is enabled.
#
# Default: 16KB
rocksdb.block_size 16384
# Indicating if we'd put index/filter blocks to the block cache
#
# Default: yes
rocksdb.cache_index_and_filter_blocks yes
# Specify the compression algorithm to use. Only levels greater
# than 2 are compressed, to improve performance.
# Accept value: "no", "snappy", "lz4", "zstd", "zlib"
# default snappy
rocksdb.compression snappy
# If non-zero, we perform bigger reads when doing compaction. If you're
# running RocksDB on spinning disks, you should set this to at least 2MB.
# That way RocksDB's compaction is doing sequential instead of random reads.
# When non-zero, we also force new_table_reader_for_compaction_inputs to
# true.
#
# Default: 2 MB
rocksdb.compaction_readahead_size 2097152
# The limited write rate to the DB if soft_pending_compaction_bytes_limit or
# level0_slowdown_writes_trigger is triggered.
# If the value is 0, we will infer a value from the `rate_limiter` value
# if it is not empty, or 16MB if `rate_limiter` is empty. Note that
# if users change the rate in `rate_limiter` after the DB is opened,
# `delayed_write_rate` won't be adjusted.
#
rocksdb.delayed_write_rate 0
# If enable_pipelined_write is true, separate write thread queue is
# maintained for WAL write and memtable write.
#
# Default: no
rocksdb.enable_pipelined_write no
# Soft limit on number of level-0 files. We start slowing down writes at this
# point. A value <0 means that no writing slow down will be triggered by
# number of files in level-0.
#
# Default: 20
rocksdb.level0_slowdown_writes_trigger 20
# Maximum number of level-0 files. We stop writes at this point.
#
# Default: 40
rocksdb.level0_stop_writes_trigger 40
# Number of files to trigger level-0 compaction.
#
# Default: 4
rocksdb.level0_file_num_compaction_trigger 4
# if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
#
# Default: 0
rocksdb.stats_dump_period_sec 0
# if yes, auto compaction will be disabled, but manual compaction still works
#
# Default: no
rocksdb.disable_auto_compactions no
# BlobDB(key-value separation) is essentially RocksDB for large-value use cases.
# Since 6.18.0, The new implementation is integrated into the RocksDB core.
# When set, large values (blobs) are written to separate blob files, and only
# pointers to them are stored in SST files. This can reduce write amplification
# for large-value use cases at the cost of introducing a level of indirection
# for reads. Please see: https://github.com/facebook/rocksdb/wiki/BlobDB.
#
# Note that when enable_blob_files is set to yes, BlobDB-related configuration
# items will take effect.
#
# Default: no
rocksdb.enable_blob_files no
# The size of the smallest value to be stored separately in a blob file. Values
# which have an uncompressed size smaller than this threshold are stored alongside
# the keys in SST files in the usual fashion.
#
# Default: 4096 byte, 0 means that all values are stored in blob files
rocksdb.min_blob_size 4096
# The size limit for blob files. When writing blob files, a new file is
# opened once this limit is reached.
#
# Default: 268435456 bytes
rocksdb.blob_file_size 268435456
# Enables garbage collection of blobs. Valid blobs residing in blob files
# older than a cutoff get relocated to new files as they are encountered
# during compaction, which makes it possible to clean up blob files once
# they contain nothing but obsolete/garbage blobs.
# See also rocksdb.blob_garbage_collection_age_cutoff below.
#
# Default: yes
rocksdb.enable_blob_garbage_collection yes
# The percentage cutoff in terms of blob file age for garbage collection.
# Blobs in the oldest N blob files will be relocated when encountered during
# compaction, where N = (garbage_collection_cutoff/100) * number_of_blob_files.
# Note that this value must belong to [0, 100].
#
# Default: 25
rocksdb.blob_garbage_collection_age_cutoff 25
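# Illustrative worked example (not part of the upstream template): with the cutoff
# of 25 above, a database holding e.g. 200 blob files would relocate blobs from the
# oldest 200 * 25 / 100 = 50 files as compaction touches them.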
# The purpose of the following three options is to dynamically adjust the upper limit of
# the data that each layer can store according to the size of the different
# layers of the LSM. Enabling this option will bring some improvements in
# deletion efficiency and space amplification, but it will lose a certain
# amount of read performance.
# If you want to know more details about Levels' Target Size, you can read RocksDB wiki:
# https://github.com/facebook/rocksdb/wiki/Leveled-Compaction#levels-target-size
#
# Default: yes
rocksdb.level_compaction_dynamic_level_bytes yes
# The total file size of level-1 sst.
#
# Default: 268435456 bytes
rocksdb.max_bytes_for_level_base 268435456
# Multiplication factor for the total file size of L(n+1) layers.
# This option is a double in RocksDB, but kvrocks does
# not support double-typed options yet, so we currently use an integer
# instead of a double.
#
# Default: 10
rocksdb.max_bytes_for_level_multiplier 10
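# Illustrative worked example (not part of the upstream template): with
# max_bytes_for_level_base 256 MB and the multiplier of 10 above, the target level
# sizes are roughly L1 = 256 MB, L2 = 2.5 GB, L3 = 25 GB, and so on.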
# This feature only takes effect in Iterators and MultiGet.
# If yes, RocksDB will try to read asynchronously and in parallel as much as possible to hide IO latency.
# In iterators, it will prefetch data asynchronously in the background for each file being iterated on.
# In MultiGet, it will read the necessary data blocks from those files in parallel as much as possible.
# Default no
rocksdb.read_options.async_io no
# If yes, the write will be flushed from the operating system
# buffer cache before the write is considered complete.
# If this flag is enabled, writes will be slower.
# If this flag is disabled, and the machine crashes, some recent
# writes may be lost. Note that if it is just the process that
# crashes (i.e., the machine does not reboot), no writes will be
# lost even if sync==false.
#
# Default: no
rocksdb.write_options.sync no
# If yes, writes will not first go to the write ahead log,
# and the write may get lost after a crash.
#
# Default: no
rocksdb.write_options.disable_wal no
# If enabled and we need to wait or sleep for the write request, fails
# immediately.
#
# Default: no
rocksdb.write_options.no_slowdown no
# If enabled, write requests are of lower priority if compaction is
# behind. In this case, no_slowdown = true, the request will be canceled
# immediately. Otherwise, it will be slowed down.
# The slowdown value is determined by RocksDB to guarantee
# it introduces minimum impacts to high priority writes.
#
# Default: no
rocksdb.write_options.low_pri no
# If enabled, this writebatch will maintain the last insert positions of each
# memtable as hints in concurrent write. It can improve write performance
# in concurrent writes if keys in one writebatch are sequential.
#
# Default: no
rocksdb.write_options.memtable_insert_hint_per_batch no
# Support the RocksDB auto-tuned rate limiter for background IO.
# If enabled, the rate limiter will limit the compaction write rate if the flush write rate is high.
# Please see https://rocksdb.org/blog/2017/12/18/17-auto-tuned-rate-limiter.html
#
# Default: yes
rocksdb.rate_limiter_auto_tuned yes
# Enabling this option will schedule the deletion of obsolete files in a background thread
# on iterator destruction. It can reduce the latency if there are many files to be removed.
# see https://github.com/facebook/rocksdb/wiki/IO#avoid-blocking-io
#
# Default: yes
# rocksdb.avoid_unnecessary_blocking_io yes
################################ NAMESPACE #####################################
# namespace.test change.me
backup-dir .//backup

full_index/run_kvrocks.sh Executable file

@@ -0,0 +1,6 @@
#!/bin/bash
set -e
set -x
../../kvrocks/build/kvrocks -c kvrocks.conf


@@ -93,6 +93,10 @@ class CaptureCache():
self.user_agent: str | None = cache_entry.get('user_agent')
self.referer: str | None = cache_entry.get('referer')
@property
def tree_ready(self) -> bool:
return bool(_pickle_path(self.capture_dir))
@property
def tree(self) -> CrawledTree:
if not self.capture_dir.exists():
@@ -102,27 +106,36 @@
return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger)
def remove_pickle_tree(capture_dir: Path) -> None:
pickle_file = capture_dir / 'tree.pickle'
def _pickle_path(capture_dir: Path) -> Path | None:
pickle_file_gz = capture_dir / 'tree.pickle.gz'
if pickle_file.exists():
pickle_file.unlink()
if pickle_file_gz.exists():
pickle_file_gz.unlink()
return pickle_file_gz
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
return pickle_file
return None
def remove_pickle_tree(capture_dir: Path) -> None:
pickle_path = _pickle_path(capture_dir)
if pickle_path and pickle_path.exists():
pickle_path.unlink()
@lru_cache(maxsize=64)
def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree:
pickle_file = capture_dir / 'tree.pickle'
pickle_file_gz = capture_dir / 'tree.pickle.gz'
pickle_path = _pickle_path(capture_dir)
tree = None
try:
if pickle_file.exists():
with pickle_file.open('rb') as _p:
tree = pickle.load(_p)
elif pickle_file_gz.exists():
with gzip.open(pickle_file_gz, 'rb') as _pg:
tree = pickle.load(_pg)
if pickle_path:
if pickle_path.suffix == '.gz':
with gzip.open(pickle_path, 'rb') as _pg:
tree = pickle.load(_pg)
else: # not a GZ pickle
with pickle_path.open('rb') as _p:
tree = pickle.load(_p)
except pickle.UnpicklingError:
remove_pickle_tree(capture_dir)
except EOFError:


@@ -95,8 +95,10 @@ def safe_create_dir(to_create: Path) -> None:
def get_socket_path(name: str) -> str:
mapping = {
'cache': Path('cache', 'cache.sock'),
'indexing': Path('indexing', 'indexing.sock'),
'indexing': Path('indexing', 'indexing.sock')
}
if get_config('generic', 'index_everything'):
mapping['full_index'] = Path('full_index', 'full_index.sock')
return str(get_homedir() / mapping[name])


@@ -24,24 +24,49 @@ from .default import get_socket_path, get_config
class Indexing():
def __init__(self) -> None:
def __init__(self, full_index: bool=False) -> None:
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(get_config('generic', 'loglevel'))
self.redis_pool_bytes: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'))
self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'), decode_responses=True)
self.__redis_pool_bytes: ConnectionPool
self.__redis_pool: ConnectionPool
if full_index:
self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('full_index'))
self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('full_index'), decode_responses=True)
else:
self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'))
self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'), decode_responses=True)
def clear_indexes(self) -> None:
self.redis.flushdb()
@property
def redis_bytes(self) -> Redis: # type: ignore[type-arg]
return Redis(connection_pool=self.redis_pool_bytes)
return Redis(connection_pool=self.__redis_pool_bytes)
@property
def redis(self) -> Redis: # type: ignore[type-arg]
return Redis(connection_pool=self.redis_pool)
return Redis(connection_pool=self.__redis_pool)
@property
def can_index(self) -> bool:
return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))
def indexing_done(self) -> None:
self.redis.delete('ongoing_indexing')
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool]:
p = self.redis.pipeline()
p.sismember('indexed_urls', capture_uuid)
p.sismember('indexed_body_hashes', capture_uuid)
p.sismember('indexed_cookies', capture_uuid)
p.sismember('indexed_hhhashes', capture_uuid)
p.sismember('indexed_favicons', capture_uuid)
# This call for sure returns a tuple of 5 booleans
return p.execute() # type: ignore[return-value]
def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:
# only trigger this method if the capture was already indexed.


@@ -56,7 +56,6 @@ from .helpers import (get_captures_dir, get_email_template,
get_resources_hashes, get_taxonomies,
uniq_domains, ParsedUserAgent, load_cookies, UserAgents,
get_useragent_for_requests, make_ts_from_dirname)
from .indexing import Indexing
from .modules import (MISPs, PhishingInitiative, UniversalWhois,
UrlScan, VirusTotal, Phishtank, Hashlookup,
RiskIQ, RiskIQError, Pandora, URLhaus, CIRCLPDNS)
@@ -81,7 +80,6 @@ class Lookyloo():
def __init__(self) -> None:
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(get_config('generic', 'loglevel'))
self.indexing = Indexing()
self.user_agents = UserAgents()
self.is_public_instance = get_config('generic', 'public_instance')
self.public_domain = get_config('generic', 'public_domain')
@@ -938,214 +936,10 @@ class Lookyloo():
return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
- set(ct.root_hartree.all_url_requests.keys()))
def get_body_hash_investigator(self, body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]:
'''Returns all the captures related to a hash (sha512), used in the web interface.'''
total_captures, details = self.indexing.get_body_hash_captures(body_hash, limit=-1)
captures = []
for capture_uuid, hostnode_uuid, hostname, _, url in details:
cache = self.capture_cache(capture_uuid)
if not cache:
continue
captures.append((cache.uuid, cache.title, cache.timestamp, hostnode_uuid, url))
domains = self.indexing.get_body_hash_domains(body_hash)
return captures, domains
def get_body_hash_full(self, body_hash: str, /) -> tuple[dict[str, list[dict[str, str]]], BytesIO]:
'''Returns a lot of information about the hash (sha512) and the hits in the instance.
Also contains the data (base64 encoded)'''
details = self.indexing.get_body_hash_urls(body_hash)
# Break immediately if we have the hash of the empty file
if body_hash == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e':
return details, BytesIO()
# get the body from the first entry in the details list
for _, entries in details.items():
if not entries:
continue
ct = self.get_crawled_tree(entries[0]['capture'])
try:
urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
except Exception:
# Unable to find URLnode in the tree, it probably has been rebuild.
self.logger.warning(f'Unable to find {entries[0]["urlnode"]} in {entries[0]["capture"]}')
continue
# From that point, we just try to get the content. Break as soon as we found one.
if urlnode.body_hash == body_hash:
# the hash we're looking for is the whole file
return details, urlnode.body
else:
# The hash is an embedded resource
for _, blobs in urlnode.embedded_ressources.items():
for h, b in blobs:
if h == body_hash:
return details, b
# TODO: Couldn't find the file anywhere. Maybe return a warning in the file?
return details, BytesIO()
def get_all_body_hashes(self, capture_uuid: str, /) -> dict[str, dict[str, URLNode | int]]:
ct = self.get_crawled_tree(capture_uuid)
to_return: dict[str, dict[str, URLNode | int]] = defaultdict()
for node in ct.root_hartree.url_tree.traverse():
if node.empty_response or node.body_hash in to_return:
# If we have the same hash more than once, skip
continue
total_captures, details = self.indexing.get_body_hash_captures(node.body_hash, limit=-1)
# Note for future: maybe get url, capture title, something better than just the hash to show to the user
to_return[node.body_hash] = {'node': node, 'total_captures': total_captures}
return to_return
def get_latest_url_capture(self, url: str, /) -> CaptureCache | None:
'''Get the most recent capture with this URL'''
captures = self.sorted_capture_cache(self.indexing.get_captures_url(url))
if captures:
return captures[0]
return None
def get_url_occurrences(self, url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
'''Get the most recent captures and URL nodes where the URL has been seen.'''
captures = self.sorted_capture_cache(self.indexing.get_captures_url(url), cached_captures_only=cached_captures_only)
to_return: list[dict[str, Any]] = []
for capture in captures[:limit]:
ct = self.get_crawled_tree(capture.uuid)
to_append: dict[str, str | dict[str, Any]] = {'capture_uuid': capture.uuid,
'start_timestamp': capture.timestamp.isoformat(),
'title': capture.title}
urlnodes: dict[str, dict[str, str]] = {}
for urlnode in ct.root_hartree.url_tree.search_nodes(name=url):
urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(),
'hostnode_uuid': urlnode.hostnode_uuid}
if hasattr(urlnode, 'body_hash'):
urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash
to_append['urlnodes'] = urlnodes
to_return.append(to_append)
return to_return
def get_hostname_occurrences(self, hostname: str, /, with_urls_occurrences: bool=False, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
'''Get the most recent captures and URL nodes where the hostname has been seen.'''
captures = self.sorted_capture_cache(self.indexing.get_captures_hostname(hostname), cached_captures_only=cached_captures_only)
to_return: list[dict[str, Any]] = []
for capture in captures[:limit]:
ct = self.get_crawled_tree(capture.uuid)
to_append: dict[str, str | list[Any] | dict[str, Any]] = {
'capture_uuid': capture.uuid,
'start_timestamp': capture.timestamp.isoformat(),
'title': capture.title}
hostnodes: list[str] = []
if with_urls_occurrences:
urlnodes: dict[str, dict[str, str]] = {}
for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname):
hostnodes.append(hostnode.uuid)
if with_urls_occurrences:
for urlnode in hostnode.urls:
urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(),
'url': urlnode.name,
'hostnode_uuid': urlnode.hostnode_uuid}
if hasattr(urlnode, 'body_hash'):
urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash
to_append['hostnodes'] = hostnodes
if with_urls_occurrences:
to_append['urlnodes'] = urlnodes
to_return.append(to_append)
return to_return
def get_cookie_name_investigator(self, cookie_name: str, /) -> tuple[list[tuple[str, str]], list[tuple[str, float, list[tuple[str, float]]]]]:
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
cached_captures = self.sorted_capture_cache([entry[0] for entry in self.indexing.get_cookies_names_captures(cookie_name)])
captures = [(cache.uuid, cache.title) for cache in cached_captures]
domains = [(domain, freq, self.indexing.cookies_names_domains_values(cookie_name, domain))
for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
return captures, domains
def compute_mmh3_shodan(self, favicon: bytes, /) -> str:
b64 = base64.encodebytes(favicon)
return str(mmh3.hash(b64))
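A quick illustration (sketch only, not part of this change): the Shodan-style favicon hash is just MurmurHash3 over the base64-encoded (encodebytes, newline-wrapped) favicon bytes, so the value computed above can be reproduced standalone. The URL below is a placeholder.
import base64
import mmh3
import requests
# Fetch any favicon and hash it the same way compute_mmh3_shodan() does.
favicon = requests.get('https://example.com/favicon.ico', timeout=10).content
print(mmh3.hash(base64.encodebytes(favicon)))  # usable as http.favicon.hash:<value> on Shodan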
def get_favicon_investigator(self, favicon_sha512: str,
/,
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
tuple[str, str, str],
dict[str, dict[str, dict[str, tuple[str, str]]]]]:
'''Returns all the captures related to a favicon hash (sha512), used in the web interface.'''
cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)])
captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
favicon = self.indexing.get_favicon(favicon_sha512)
if favicon:
mimetype = from_string(favicon, mime=True)
b64_favicon = base64.b64encode(favicon).decode()
mmh3_shodan = self.compute_mmh3_shodan(favicon)
else:
mimetype = ''
b64_favicon = ''
mmh3_shodan = ''
# For now, there is only one probabilistic hash algo for favicons, keeping it simple
probabilistic_hash_algos = ['mmh3-shodan']
probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
if get_probabilistic:
for algo in probabilistic_hash_algos:
probabilistic_favicons[algo] = {}
for mm3hash in self.indexing.get_probabilistic_hashes_favicon(algo, favicon_sha512):
probabilistic_favicons[algo][mm3hash] = {}
for sha512 in self.indexing.get_hashes_favicon_probablistic(algo, mm3hash):
if sha512 == favicon_sha512:
# Skip entry if it is the same as the favicon we are investigating
continue
favicon = self.indexing.get_favicon(sha512)
if favicon:
mimetype = from_string(favicon, mime=True)
b64_favicon = base64.b64encode(favicon).decode()
probabilistic_favicons[algo][mm3hash][sha512] = (mimetype, b64_favicon)
if not probabilistic_favicons[algo][mm3hash]:
# remove entry if it has no favicon
probabilistic_favicons[algo].pop(mm3hash)
if not probabilistic_favicons[algo]:
# remove entry if it has no hash
probabilistic_favicons.pop(algo)
return captures, (mimetype, b64_favicon, mmh3_shodan), probabilistic_favicons
def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
'''Returns all the captures related to an HTTP headers hash (HHH), used in the web interface.'''
all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh))
if cached_captures := self.sorted_capture_cache([entry for entry in all_captures]):
captures = []
for cache in cached_captures:
try:
urlnode = self.get_urlnode_from_tree(cache.uuid, all_captures[cache.uuid])
except Exception as e:
self.logger.warning(f'Cache for {cache.uuid} needs a rebuild: {e}.')
self._captures_index.remove_pickle(cache.uuid)
continue
captures.append((cache.uuid, urlnode.hostnode_uuid, urlnode.name, cache.title))
# get the headers and format them as they were in the response
urlnode = self.get_urlnode_from_tree(cached_captures[0].uuid, all_captures[cached_captures[0].uuid])
headers = [(header["name"], header["value"]) for header in urlnode.response['headers']]
return captures, headers
return [], []
def hash_lookup(self, blob_hash: str, url: str, capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]:
'''Search all the captures where a specific hash was seen.
If a URL is given, it splits the results depending on whether the hash was seen on the same URL or another one.
Capture UUID avoids duplicates on the same capture'''
captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
total_captures, details = self.indexing.get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1,
prefered_uuids=set(self._captures_index.keys()))
for h_capture_uuid, url_uuid, url_hostname, same_url, url in details:
cache = self.capture_cache(h_capture_uuid)
if cache and hasattr(cache, 'title'):
if same_url:
captures_list['same_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
else:
captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
# Sort by timestamp by default
captures_list['same_url'].sort(key=lambda y: y[3])
captures_list['different_url'].sort(key=lambda y: y[3])
return total_captures, captures_list
def get_ressource(self, tree_uuid: str, /, urlnode_uuid: str, h: str | None) -> tuple[str, BytesIO, str] | None:
'''Get a specific resource from a URL node. If a hash is also given, we want an embedded resource.'''
try:
@ -1375,116 +1169,6 @@ class Lookyloo():
"""Get the preconfigured devices from Playwright"""
return get_devices()
def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> tuple[HostNode, list[dict[str, Any]]]:
'''Gather all the information needed to display the Hostnode investigator popup.'''
def normalize_known_content(h: str, /, known_content: dict[str, Any], url: URLNode) -> tuple[str | list[Any] | None, tuple[bool, Any] | None]:
''' There are a few different sources to figure out known vs. legitimate content,
this method normalizes it for the web interface.'''
known: str | list[Any] | None = None
legitimate: tuple[bool, Any] | None = None
if h not in known_content:
return known, legitimate
if known_content[h]['type'] in ['generic', 'sanejs']:
known = known_content[h]['details']
elif known_content[h]['type'] == 'legitimate_on_domain':
legit = False
if url.hostname in known_content[h]['details']:
legit = True
legitimate = (legit, known_content[h]['details'])
elif known_content[h]['type'] == 'malicious':
legitimate = (False, known_content[h]['details'])
return known, legitimate
ct = self.get_crawled_tree(capture_uuid)
hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
known_content = self.context.find_known_content(hostnode)
self.uwhois.query_whois_hostnode(hostnode)
urls: list[dict[str, Any]] = []
for url in hostnode.urls:
# For the popup, we need:
# * https vs http
# * everything after the domain
# * the full URL
to_append: dict[str, Any] = {
'encrypted': url.name.startswith('https'),
'url_path': url.name.split('/', 3)[-1],
'url_object': url,
}
if not url.empty_response:
# Index lookup
# %%% Full body %%%
freq = self.indexing.body_hash_fequency(url.body_hash)
to_append['body_hash_details'] = freq
if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1:
to_append['body_hash_details']['other_captures'] = self.hash_lookup(url.body_hash, url.name, capture_uuid)
# %%% Embedded ressources %%%
if hasattr(url, 'embedded_ressources') and url.embedded_ressources:
to_append['embedded_ressources'] = {}
for mimetype, blobs in url.embedded_ressources.items():
for h, blob in blobs:
if h in to_append['embedded_ressources']:
# Skip duplicates
continue
freq_embedded = self.indexing.body_hash_fequency(h)
to_append['embedded_ressources'][h] = freq_embedded
to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes
to_append['embedded_ressources'][h]['type'] = mimetype
if freq_embedded['hash_freq'] > 1:
to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid)
for h in to_append['embedded_ressources'].keys():
known, legitimate = normalize_known_content(h, known_content, url)
if known:
to_append['embedded_ressources'][h]['known_content'] = known
elif legitimate:
to_append['embedded_ressources'][h]['legitimacy'] = legitimate
known, legitimate = normalize_known_content(url.body_hash, known_content, url)
if known:
to_append['known_content'] = known
elif legitimate:
to_append['legitimacy'] = legitimate
# Optional: Cookies sent to the server in the request -> map to the nodes that set the cookie in a response
if hasattr(url, 'cookies_sent'):
to_display_sent: dict[str, set[Iterable[str | None]]] = defaultdict(set)
for cookie, contexts in url.cookies_sent.items():
if not contexts:
# Locally created?
to_display_sent[cookie].add(('Unknown origin', ))
continue
for context in contexts:
to_display_sent[cookie].add((context['setter'].hostname, context['setter'].hostnode_uuid))
to_append['cookies_sent'] = to_display_sent
# Optional: Cookies received from the server in the response -> map to the nodes that send the cookie in a request
if hasattr(url, 'cookies_received'):
to_display_received: dict[str, dict[str, set[Iterable[str | None]]]] = {'3rd_party': defaultdict(set), 'sent': defaultdict(set), 'not_sent': defaultdict(set)}
for domain, c_received, is_3rd_party in url.cookies_received:
if c_received not in ct.root_hartree.cookies_sent:
# This cookie is never sent.
if is_3rd_party:
to_display_received['3rd_party'][c_received].add((domain, ))
else:
to_display_received['not_sent'][c_received].add((domain, ))
continue
for url_node in ct.root_hartree.cookies_sent[c_received]:
if is_3rd_party:
to_display_received['3rd_party'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
else:
to_display_received['sent'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
to_append['cookies_received'] = to_display_received
urls.append(to_append)
return hostnode, urls
def get_stats(self) -> dict[str, list[Any]]:
'''Gather statistics about the lookyloo instance'''
today = date.today()

poetry.lock generated
View File

@ -1,25 +1,25 @@
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
[[package]]
name = "aiobotocore"
version = "2.11.2"
version = "2.12.1"
description = "Async client for aws services using botocore and aiohttp"
optional = false
python-versions = ">=3.8"
files = [
{file = "aiobotocore-2.11.2-py3-none-any.whl", hash = "sha256:487fede588040bfa3a43df945275c28c1c73ca75bf705295adb9fbadd2e89be7"},
{file = "aiobotocore-2.11.2.tar.gz", hash = "sha256:6dd7352248e3523019c5a54a395d2b1c31080697fc80a9ad2672de4eec8c7abd"},
{file = "aiobotocore-2.12.1-py3-none-any.whl", hash = "sha256:6a9a3d646cf422f45fdc1e4256e78563ebffba64733bc9b8ca9123614e8ba9af"},
{file = "aiobotocore-2.12.1.tar.gz", hash = "sha256:8706b28f16f93c541f6ed50352115a79d8f3499539f8d0bb70aa0f7a5379c1fe"},
]
[package.dependencies]
aiohttp = ">=3.7.4.post0,<4.0.0"
aioitertools = ">=0.5.1,<1.0.0"
botocore = ">=1.33.2,<1.34.35"
botocore = ">=1.34.41,<1.34.52"
wrapt = ">=1.10.10,<2.0.0"
[package.extras]
awscli = ["awscli (>=1.31.2,<1.32.35)"]
boto3 = ["boto3 (>=1.33.2,<1.34.35)"]
awscli = ["awscli (>=1.32.41,<1.32.52)"]
boto3 = ["boto3 (>=1.34.41,<1.34.52)"]
[[package]]
name = "aiohttp"
@ -308,13 +308,13 @@ WTForms = "*"
[[package]]
name = "botocore"
version = "1.34.34"
version = "1.34.51"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">= 3.8"
files = [
{file = "botocore-1.34.34-py3-none-any.whl", hash = "sha256:cd060b0d88ebb2b893f1411c1db7f2ba66cc18e52dcc57ad029564ef5fec437b"},
{file = "botocore-1.34.34.tar.gz", hash = "sha256:54093dc97372bb7683f5c61a279aa8240408abf3b2cc494ae82a9a90c1b784b5"},
{file = "botocore-1.34.51-py3-none-any.whl", hash = "sha256:01d5156247f991b3466a8404e3d7460a9ecbd9b214f9992d6ba797d9ddc6f120"},
{file = "botocore-1.34.51.tar.gz", hash = "sha256:5086217442e67dd9de36ec7e87a0c663f76b7790d5fb6a12de565af95e87e319"},
]
[package.dependencies]
@ -1331,13 +1331,13 @@ test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.22)", "pa
[[package]]
name = "ipython"
version = "8.22.1"
version = "8.22.2"
description = "IPython: Productive Interactive Computing"
optional = false
python-versions = ">=3.10"
files = [
{file = "ipython-8.22.1-py3-none-any.whl", hash = "sha256:869335e8cded62ffb6fac8928e5287a05433d6462e3ebaac25f4216474dd6bc4"},
{file = "ipython-8.22.1.tar.gz", hash = "sha256:39c6f9efc079fb19bfb0f17eee903978fe9a290b1b82d68196c641cecb76ea22"},
{file = "ipython-8.22.2-py3-none-any.whl", hash = "sha256:3c86f284c8f3d8f2b6c662f885c4889a91df7cd52056fd02b7d8d6195d7f56e9"},
{file = "ipython-8.22.2.tar.gz", hash = "sha256:2dcaad9049f9056f1fef63514f176c7d41f930daa78d05b82a176202818f2c14"},
]
[package.dependencies]
@ -1463,20 +1463,21 @@ referencing = ">=0.31.0"
[[package]]
name = "lacuscore"
version = "1.8.7"
version = "1.8.8"
description = "Core of Lacus, usable as a module"
optional = false
python-versions = ">=3.8,<4.0"
files = [
{file = "lacuscore-1.8.7-py3-none-any.whl", hash = "sha256:1ac849b1308eb780f1976fdf21d6476bd911e6ae1e91f79d83612baef90afee8"},
{file = "lacuscore-1.8.7.tar.gz", hash = "sha256:67268fb4da1282d1c7f747b02611dd5ee549644e034d0acba2173396ce0d0408"},
{file = "lacuscore-1.8.8-py3-none-any.whl", hash = "sha256:04812225e101ec59b3d1dcc6d3474e3cd2f3fd656a72d619e7d7d238d00b5a27"},
{file = "lacuscore-1.8.8.tar.gz", hash = "sha256:41949ff67d056f8ba717b649d8b45307ff7d38d4c38291cb1a8b80ca2ce94f6f"},
]
[package.dependencies]
async-timeout = {version = ">=4.0.3,<5.0.0", markers = "python_version < \"3.11\""}
defang = ">=0.5.3,<0.6.0"
dnspython = ">=2.6.1,<3.0.0"
playwrightcapture = {version = ">=1.23.8,<2.0.0", extras = ["recaptcha"]}
redis = {version = ">=5.0.1,<6.0.0", extras = ["hiredis"]}
playwrightcapture = {version = ">=1.23.9,<2.0.0", extras = ["recaptcha"]}
redis = {version = ">=5.0.2,<6.0.0", extras = ["hiredis"]}
requests = ">=2.31.0,<3.0.0"
ua-parser = ">=0.18.0,<0.19.0"
@ -2286,13 +2287,13 @@ test = ["pytest"]
[[package]]
name = "playwrightcapture"
version = "1.23.8"
version = "1.23.9"
description = "A simple library to capture websites using playwright"
optional = false
python-versions = ">=3.8,<4.0"
files = [
{file = "playwrightcapture-1.23.8-py3-none-any.whl", hash = "sha256:f3e4d6c0355b013e465f9d3eea961b9431303a5de227a1388a7287c872203b9e"},
{file = "playwrightcapture-1.23.8.tar.gz", hash = "sha256:d2caea8d7a16d739f28dc06bbbc12665be89d07d325bba6868dab5f8520db809"},
{file = "playwrightcapture-1.23.9-py3-none-any.whl", hash = "sha256:0324f587605aa85ede1b71c12ec735383d932324f0e66ef35345c6e08734273c"},
{file = "playwrightcapture-1.23.9.tar.gz", hash = "sha256:e7217fc2a6109f240918de977452c556f482822abb12f0db43fa28228d3c0c90"},
]
[package.dependencies]
@ -2339,13 +2340,13 @@ files = [
[[package]]
name = "publicsuffixlist"
version = "0.10.0.20240214"
version = "0.10.0.20240305"
description = "publicsuffixlist implement"
optional = false
python-versions = ">=2.6"
files = [
{file = "publicsuffixlist-0.10.0.20240214-py2.py3-none-any.whl", hash = "sha256:2c3b8da819571bb610328bda5b25d27fcbf6bc400896ca3c6502d291a16b32f4"},
{file = "publicsuffixlist-0.10.0.20240214.tar.gz", hash = "sha256:45a206c5f9c1eccf138481280cfb0a67c2ccafc782ef89c7fd6dc6c4356230fe"},
{file = "publicsuffixlist-0.10.0.20240305-py2.py3-none-any.whl", hash = "sha256:f6869119f8781501c0c625e59b4b65eb60e2ed5185cfd6c142c792f74ac47c21"},
{file = "publicsuffixlist-0.10.0.20240305.tar.gz", hash = "sha256:6e79ea73b0278ce1b102f3ad6815f2a5b683864da9948ba0b0eab3180c419f7f"},
]
[package.extras]
@ -2571,13 +2572,13 @@ docs = ["Sphinx (<7.2)", "Sphinx (>=7.2,<8.0)"]
[[package]]
name = "pymisp"
version = "2.4.185"
version = "2.4.186"
description = "Python API for MISP."
optional = false
python-versions = ">=3.8,<4.0"
files = [
{file = "pymisp-2.4.185-py3-none-any.whl", hash = "sha256:e2635a2be92321d4f812c7220bd955817e95a286343720f138b87892a827117a"},
{file = "pymisp-2.4.185.tar.gz", hash = "sha256:3ccdc6ee48d26d82c77ba3f5d8fd41a79eaaef0ad5619f37a65b060e92f6da4c"},
{file = "pymisp-2.4.186-py3-none-any.whl", hash = "sha256:bb8ae23d038848a86cf5d6a4c965dbed79e48cd6f671681b17f72410aecf07a0"},
{file = "pymisp-2.4.186.tar.gz", hash = "sha256:bdf2d54b297ad890418179b044dd4ea79821fccef723823919d12262e9794ca3"},
]
[package.dependencies]
@ -2593,7 +2594,7 @@ requests = ">=2.31.0,<3.0.0"
[package.extras]
brotli = ["urllib3[brotli]"]
docs = ["Sphinx (<7.2)", "Sphinx (>=7.2,<8.0)", "recommonmark (>=0.7.1,<0.8.0)", "sphinx-autodoc-typehints (>=2.0.0,<3.0.0)"]
email = ["RTFDE (>=0.1.1,<0.2.0)", "extract_msg (>=0.47.0,<0.48.0)", "oletools (>=0.60.1,<0.61.0)"]
email = ["RTFDE (>=0.1.1,<0.2.0)", "extract_msg (>=0.48.0,<0.49.0)", "oletools (>=0.60.1,<0.61.0)"]
fileobjects = ["lief (>=0.14.1,<0.15.0)", "pydeep2 (>=0.5.1,<0.6.0)", "python-magic (>=0.4.27,<0.5.0)"]
openioc = ["beautifulsoup4 (>=4.12.3,<5.0.0)"]
pdfexport = ["reportlab (>=4.1.0,<5.0.0)"]
@ -2668,13 +2669,13 @@ requests = ">=2.31.0,<3.0.0"
[[package]]
name = "pysecuritytxt"
version = "1.2.2"
version = "1.3.0"
description = "Python CLI and module for querying security.txt files on domains."
optional = false
python-versions = ">=3.8,<4.0"
files = [
{file = "pysecuritytxt-1.2.2-py3-none-any.whl", hash = "sha256:08d8750d82e9502ba949a6ea7bab355ca183cfc3cd722ed3e492ba35a8d4edda"},
{file = "pysecuritytxt-1.2.2.tar.gz", hash = "sha256:31d4ea4814e2cdeffce304e7b6f9d58580e7fb6578c8694bb6f8c0df59e65b3d"},
{file = "pysecuritytxt-1.3.0-py3-none-any.whl", hash = "sha256:9e4eb6b4fdca8f8471c80696c4d7642be24d44c8c3f627870ca9b7bd3f221cd5"},
{file = "pysecuritytxt-1.3.0.tar.gz", hash = "sha256:3669be69e90672ed0d448b385e5fef49cb3a6a611d7e386d673c4f0e1cc3e83b"},
]
[package.dependencies]
@ -2712,13 +2713,13 @@ webui = ["Flask (>=2.0,<3.0)", "Flask-Bootstrap (>=3.3.7.1,<4.0.0.0)", "Flask-WT
[[package]]
name = "python-dateutil"
version = "2.8.2"
version = "2.9.0.post0"
description = "Extensions to the standard Python datetime module"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
files = [
{file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
{file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
]
[package.dependencies]
@ -2748,17 +2749,17 @@ files = [
[[package]]
name = "redis"
version = "5.0.1"
version = "5.0.2"
description = "Python client for Redis database and key-value store"
optional = false
python-versions = ">=3.7"
files = [
{file = "redis-5.0.1-py3-none-any.whl", hash = "sha256:ed4802971884ae19d640775ba3b03aa2e7bd5e8fb8dfaed2decce4d0fc48391f"},
{file = "redis-5.0.1.tar.gz", hash = "sha256:0dab495cd5753069d3bc650a0dde8a8f9edde16fc5691b689a566eda58100d0f"},
{file = "redis-5.0.2-py3-none-any.whl", hash = "sha256:4caa8e1fcb6f3c0ef28dba99535101d80934b7d4cd541bbb47f4a3826ee472d1"},
{file = "redis-5.0.2.tar.gz", hash = "sha256:3f82cc80d350e93042c8e6e7a5d0596e4dd68715babffba79492733e1f367037"},
]
[package.dependencies]
async-timeout = {version = ">=4.0.2", markers = "python_full_version <= \"3.11.2\""}
async-timeout = ">=4.0.3"
hiredis = {version = ">=1.0.0", optional = true, markers = "extra == \"hiredis\""}
[package.extras]
@ -2942,6 +2943,7 @@ optional = false
python-versions = "*"
files = [
{file = "requests-file-2.0.0.tar.gz", hash = "sha256:20c5931629c558fda566cacc10cfe2cd502433e628f568c34c80d96a0cc95972"},
{file = "requests_file-2.0.0-py2.py3-none-any.whl", hash = "sha256:3e493d390adb44aa102ebea827a48717336d5268968c370eaf19abaf5cae13bf"},
]
[package.dependencies]
@ -2949,13 +2951,13 @@ requests = ">=1.0.0"
[[package]]
name = "rich"
version = "13.7.0"
version = "13.7.1"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
optional = false
python-versions = ">=3.7.0"
files = [
{file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"},
{file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"},
{file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"},
{file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"},
]
[package.dependencies]
@ -3217,13 +3219,13 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,
[[package]]
name = "types-beautifulsoup4"
version = "4.12.0.20240106"
version = "4.12.0.20240229"
description = "Typing stubs for beautifulsoup4"
optional = false
python-versions = ">=3.8"
files = [
{file = "types-beautifulsoup4-4.12.0.20240106.tar.gz", hash = "sha256:98d628985b71b140bd3bc22a8cb0ab603c2f2d08f20d37925965eb4a21739be8"},
{file = "types_beautifulsoup4-4.12.0.20240106-py3-none-any.whl", hash = "sha256:cbdd60ab8aeac737ac014431b6e921b43e84279c0405fdd25a6900bb0e71da5b"},
{file = "types-beautifulsoup4-4.12.0.20240229.tar.gz", hash = "sha256:e37e4cfa11b03b01775732e56d2c010cb24ee107786277bae6bc0fa3e305b686"},
{file = "types_beautifulsoup4-4.12.0.20240229-py3-none-any.whl", hash = "sha256:000cdddb8aee4effb45a04be95654de8629fb8594a4f2f1231cff81108977324"},
]
[package.dependencies]
@ -3242,13 +3244,13 @@ files = [
[[package]]
name = "types-html5lib"
version = "1.1.11.20240222"
version = "1.1.11.20240228"
description = "Typing stubs for html5lib"
optional = false
python-versions = ">=3.8"
files = [
{file = "types-html5lib-1.1.11.20240222.tar.gz", hash = "sha256:d9517ec6ba2fa1f63113e2930a59b60722a976cc983b94d7fd772f14865e1152"},
{file = "types_html5lib-1.1.11.20240222-py3-none-any.whl", hash = "sha256:86b2dcbbebca846e68d2eac46b2717980e632de4b5d8f62ccd23d8333d2e7647"},
{file = "types-html5lib-1.1.11.20240228.tar.gz", hash = "sha256:22736b7299e605ec4ba539d48691e905fd0c61c3ea610acc59922232dc84cede"},
{file = "types_html5lib-1.1.11.20240228-py3-none-any.whl", hash = "sha256:af5de0125cb0fe5667543b158db83849b22e25c0e36c9149836b095548bf1020"},
]
[[package]]
@ -3275,13 +3277,13 @@ files = [
[[package]]
name = "types-pyopenssl"
version = "24.0.0.20240130"
version = "24.0.0.20240228"
description = "Typing stubs for pyOpenSSL"
optional = false
python-versions = ">=3.8"
files = [
{file = "types-pyOpenSSL-24.0.0.20240130.tar.gz", hash = "sha256:c812e5c1c35249f75ef5935708b2a997d62abf9745be222e5f94b9595472ab25"},
{file = "types_pyOpenSSL-24.0.0.20240130-py3-none-any.whl", hash = "sha256:24a255458b5b8a7fca8139cf56f2a8ad5a4f1a5f711b73a5bb9cb50dc688fab5"},
{file = "types-pyOpenSSL-24.0.0.20240228.tar.gz", hash = "sha256:cd990717d8aa3743ef0e73e0f462e64b54d90c304249232d48fece4f0f7c3c6a"},
{file = "types_pyOpenSSL-24.0.0.20240228-py3-none-any.whl", hash = "sha256:a472cf877a873549175e81972f153f44e975302a3cf17381eb5f3d41ccfb75a4"},
]
[package.dependencies]
@ -3734,4 +3736,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<3.13"
content-hash = "bc64701a9d95985f7d0c91086fabfc29a1c54affc60bfab612fecc3771d6acd4"
content-hash = "7e76c4614efed850e101ecaa1e91f141649ef4ad508522f0323e8efffc9eda7d"

View File

@ -29,6 +29,8 @@ shutdown = "bin.shutdown:main"
run_backend = "bin.run_backend:main"
async_capture = "bin.async_capture:main"
background_indexer = "bin.background_indexer:main"
background_build_captures = "bin.background_build_captures:main"
background_full_indexer = "bin.background_indexer:main_full_indexer"
archiver = "bin.archiver:main"
processing = "bin.background_processing:main"
start_website = "bin.start_website:main"
@ -40,7 +42,7 @@ requests = "^2.31.0"
flask = "^3.0.2"
gunicorn = "^21.2.0"
charset-normalizer = "^3.3.2"
redis = {version = "^5.0.1", extras = ["hiredis"]}
redis = {version = "^5.0.2", extras = ["hiredis"]}
beautifulsoup4 = {version = "^4.12.3", extras = ["lxml", "charset_normalizer"]}
bootstrap-flask = "^2.3.3"
defang = "^0.5.3"
@ -50,10 +52,10 @@ pysanejs = "^2.0.2"
pylookyloo = "^1.23.1"
dnspython = "^2.6.1"
pytaxonomies = "^1.5.0"
pymisp = {version = "^2.4.185", extras = ["url", "fileobjects"]}
pymisp = {version = "^2.4.186", extras = ["url", "fileobjects"]}
Pillow = "^10.2.0"
flask-restx = "^1.3.0"
rich = "^13.7.0"
rich = "^13.7.1"
pyphishtanklookup = "^1.3.2"
Flask-Cors = "^4.0.0"
pyhashlookup = "^1.2.2"
@ -65,13 +67,13 @@ passivetotal = "^2.5.9"
werkzeug = "^3.0.1"
filetype = "^1.2.0"
pypandora = "^1.8.0"
lacuscore = "^1.8.7"
lacuscore = "^1.8.8"
pylacus = "^1.8.0"
pyipasnhistory = "^2.1.2"
publicsuffixlist = "^0.10.0.20240205"
pyfaup = "^1.2"
chardet = "^5.2.0"
pysecuritytxt = "^1.2.2"
pysecuritytxt = "^1.3.0"
pylookyloomonitoring = "^1.1.3"
pytz = {"version" = "^2024.1", python = "<3.9"}
s3fs = "^2024.2.0"
@ -98,7 +100,7 @@ types-redis = {version = "^4.6.0.20240218"}
types-pkg-resources = "^0.1.3"
types-Deprecated = "^1.2.9.20240106"
types-python-dateutil = "^2.8.19.20240106"
types-beautifulsoup4 = "^4.12.0.20240106"
types-beautifulsoup4 = "^4.12.0.20240229"
types-Pillow = "^10.2.0.20240213"
types-pytz = "^2024.1.0.20240203"

View File

@ -17,14 +17,16 @@ import time
import filetype # type: ignore[import-untyped]
from collections import defaultdict
from datetime import date, datetime, timedelta, timezone
from importlib.metadata import version
from io import BytesIO, StringIO
from typing import Any, TypedDict
from typing import Any, TypedDict, Iterable
from urllib.parse import quote_plus, unquote_plus, urlparse
from uuid import uuid4
from zipfile import ZipFile
from har2tree import HostNode, URLNode
import flask_login # type: ignore[import-untyped]
from flask import (Flask, Response, Request, flash, jsonify, redirect, render_template,
request, send_file, url_for)
@ -37,7 +39,8 @@ from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined]
from werkzeug.security import check_password_hash
from werkzeug.wrappers.response import Response as WerkzeugResponse
from lookyloo import Lookyloo, CaptureSettings
from lookyloo import Lookyloo, CaptureSettings, Indexing
from lookyloo.capturecache import CaptureCache
from lookyloo.default import get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.helpers import get_taxonomies, UserAgents, load_cookies
@ -262,6 +265,353 @@ def file_response(func): # type: ignore[no-untyped-def]
return wrapper
# ##### Methods querying the indexes #####
@functools.cache
def get_indexing(user: User | None) -> Indexing:
'''Depending on whether we're logged in or not, we (can) get different indexes:
if index_everything is enabled, we have an index in kvrocks that contains all
the indexes for all the captures.
It is only accessible to authenticated (admin) users.
'''
if not get_config('generic', 'index_everything'):
return Indexing()
if not user or not user.is_authenticated:
# No user or anonymous
return Indexing()
# Logged in user
return Indexing(full_index=True)
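As a reading aid (a minimal sketch, not part of this change), the selection logic above reduces to: hand out the full kvrocks-backed index only when index_everything is enabled and the user is authenticated, otherwise fall back to the default index.
from lookyloo import Indexing
def pick_index(index_everything: bool, is_authenticated: bool) -> Indexing:
    # Mirrors get_indexing(): full index for logged-in users on instances
    # that have index_everything enabled, default index for everyone else.
    if index_everything and is_authenticated:
        return Indexing(full_index=True)
    return Indexing()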
def _get_body_hash_investigator(body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]:
'''Returns all the captures related to a hash (sha512), used in the web interface.'''
total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(body_hash, limit=-1)
captures = []
for capture_uuid, hostnode_uuid, hostname, _, url in details:
cache = lookyloo.capture_cache(capture_uuid)
if not cache:
continue
captures.append((cache.uuid, cache.title, cache.timestamp, hostnode_uuid, url))
domains = get_indexing(flask_login.current_user).get_body_hash_domains(body_hash)
return captures, domains
def get_body_hash_full(body_hash: str, /) -> tuple[dict[str, list[dict[str, str]]], BytesIO]:
'''Returns a lot of information about the hash (sha512) and the hits in the instance.
Also contains the data (base64 encoded)'''
details = get_indexing(flask_login.current_user).get_body_hash_urls(body_hash)
# Break immediately if we have the hash of the empty file
if body_hash == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e':
return details, BytesIO()
# get the body from the first entry in the details list
for _, entries in details.items():
if not entries:
continue
ct = lookyloo.get_crawled_tree(entries[0]['capture'])
try:
urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
except Exception:
# Unable to find the URLNode in the tree, it has probably been rebuilt.
# TODO: log a warning here, something like:
# self.logger.warning(f'Unable to find {entries[0]["urlnode"]} in {entries[0]["capture"]}')
# lookyloo._captures_index.remove_pickle(<capture UUID>)
continue
# From that point, we just try to get the content. Break as soon as we find one.
if urlnode.body_hash == body_hash:
# the hash we're looking for is the whole file
return details, urlnode.body
else:
# The hash is an embedded resource
for _, blobs in urlnode.embedded_ressources.items():
for h, b in blobs:
if h == body_hash:
return details, b
# TODO: Couldn't find the file anywhere. Maybe return a warning in the file?
return details, BytesIO()
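Side note (verification snippet, not part of this change): the hard-coded constant special-cased above is simply the SHA-512 of zero bytes.
import hashlib
# SHA-512 of an empty byte string; matches the empty-file shortcut above.
assert hashlib.sha512(b'').hexdigest() == (
    'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce'
    '47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')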
def get_all_body_hashes(capture_uuid: str, /) -> dict[str, dict[str, URLNode | int]]:
ct = lookyloo.get_crawled_tree(capture_uuid)
to_return: dict[str, dict[str, URLNode | int]] = defaultdict()
for node in ct.root_hartree.url_tree.traverse():
if node.empty_response or node.body_hash in to_return:
# If we have the same hash more than once, skip
continue
total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(node.body_hash, limit=-1)
# Note for future: maybe get the URL, capture title, or something better than just the hash to show to the user
to_return[node.body_hash] = {'node': node, 'total_captures': total_captures}
return to_return
def get_latest_url_capture(url: str, /) -> CaptureCache | None:
'''Get the most recent capture with this URL'''
captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url))
if captures:
return captures[0]
return None
def get_url_occurrences(url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
'''Get the most recent captures and URL nodes where the URL has been seen.'''
captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url), cached_captures_only=cached_captures_only)
to_return: list[dict[str, Any]] = []
for capture in captures[:limit]:
ct = lookyloo.get_crawled_tree(capture.uuid)
to_append: dict[str, str | dict[str, Any]] = {'capture_uuid': capture.uuid,
'start_timestamp': capture.timestamp.isoformat(),
'title': capture.title}
urlnodes: dict[str, dict[str, str]] = {}
for urlnode in ct.root_hartree.url_tree.search_nodes(name=url):
urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(),
'hostnode_uuid': urlnode.hostnode_uuid}
if hasattr(urlnode, 'body_hash'):
urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash
to_append['urlnodes'] = urlnodes
to_return.append(to_append)
return to_return
def get_hostname_occurrences(hostname: str, /, with_urls_occurrences: bool=False, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
'''Get the most recent captures and URL nodes where the hostname has been seen.'''
captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_hostname(hostname), cached_captures_only=cached_captures_only)
to_return: list[dict[str, Any]] = []
for capture in captures[:limit]:
ct = lookyloo.get_crawled_tree(capture.uuid)
to_append: dict[str, str | list[Any] | dict[str, Any]] = {
'capture_uuid': capture.uuid,
'start_timestamp': capture.timestamp.isoformat(),
'title': capture.title}
hostnodes: list[str] = []
if with_urls_occurrences:
urlnodes: dict[str, dict[str, str]] = {}
for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname):
hostnodes.append(hostnode.uuid)
if with_urls_occurrences:
for urlnode in hostnode.urls:
urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(),
'url': urlnode.name,
'hostnode_uuid': urlnode.hostnode_uuid}
if hasattr(urlnode, 'body_hash'):
urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash
to_append['hostnodes'] = hostnodes
if with_urls_occurrences:
to_append['urlnodes'] = urlnodes
to_return.append(to_append)
return to_return
def get_cookie_name_investigator(cookie_name: str, /) -> tuple[list[tuple[str, str]], list[tuple[str, float, list[tuple[str, float]]]]]:
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
cached_captures = lookyloo.sorted_capture_cache([entry[0] for entry in get_indexing(flask_login.current_user).get_cookies_names_captures(cookie_name)])
captures = [(cache.uuid, cache.title) for cache in cached_captures]
domains = [(domain, freq, get_indexing(flask_login.current_user).cookies_names_domains_values(cookie_name, domain))
for domain, freq in get_indexing(flask_login.current_user).get_cookie_domains(cookie_name)]
return captures, domains
def get_favicon_investigator(favicon_sha512: str,
/,
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
tuple[str, str, str],
dict[str, dict[str, dict[str, tuple[str, str]]]]]:
'''Returns all the captures related to a favicon hash (sha512), used in the web interface.'''
cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_favicon(favicon_sha512)])
captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
favicon = get_indexing(flask_login.current_user).get_favicon(favicon_sha512)
if favicon:
mimetype = from_string(favicon, mime=True)
b64_favicon = base64.b64encode(favicon).decode()
mmh3_shodan = lookyloo.compute_mmh3_shodan(favicon)
else:
mimetype = ''
b64_favicon = ''
mmh3_shodan = ''
# For now, there is only one probabilistic hash algo for favicons, keeping it simple
probabilistic_hash_algos = ['mmh3-shodan']
probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
if get_probabilistic:
for algo in probabilistic_hash_algos:
probabilistic_favicons[algo] = {}
for mm3hash in get_indexing(flask_login.current_user).get_probabilistic_hashes_favicon(algo, favicon_sha512):
probabilistic_favicons[algo][mm3hash] = {}
for sha512 in get_indexing(flask_login.current_user).get_hashes_favicon_probablistic(algo, mm3hash):
if sha512 == favicon_sha512:
# Skip entry if it is the same as the favicon we are investigating
continue
favicon = get_indexing(flask_login.current_user).get_favicon(sha512)
if favicon:
mimetype = from_string(favicon, mime=True)
b64_favicon = base64.b64encode(favicon).decode()
probabilistic_favicons[algo][mm3hash][sha512] = (mimetype, b64_favicon)
if not probabilistic_favicons[algo][mm3hash]:
# remove entry if it has no favicon
probabilistic_favicons[algo].pop(mm3hash)
if not probabilistic_favicons[algo]:
# remove entry if it has no hash
probabilistic_favicons.pop(algo)
return captures, (mimetype, b64_favicon, mmh3_shodan), probabilistic_favicons
def get_hhh_investigator(hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
'''Returns all the captures related to an HTTP headers hash (HHH), used in the web interface.'''
all_captures = dict(get_indexing(flask_login.current_user).get_http_headers_hashes_captures(hhh))
if cached_captures := lookyloo.sorted_capture_cache([entry for entry in all_captures]):
captures = []
for cache in cached_captures:
try:
urlnode = lookyloo.get_urlnode_from_tree(cache.uuid, all_captures[cache.uuid])
except Exception:
# NOTE: log a warning, something like:
# logger.warning(f'Cache for {cache.uuid} needs a rebuild: {e}.')
lookyloo._captures_index.remove_pickle(cache.uuid)
continue
captures.append((cache.uuid, urlnode.hostnode_uuid, urlnode.name, cache.title))
# get the headers and format them as they were in the response
urlnode = lookyloo.get_urlnode_from_tree(cached_captures[0].uuid, all_captures[cached_captures[0].uuid])
headers = [(header["name"], header["value"]) for header in urlnode.response['headers']]
return captures, headers
return [], []
def hash_lookup(blob_hash: str, url: str, capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]:
'''Search all the captures where a specific hash was seen.
If a URL is given, it splits the results depending on whether the hash was seen on the same URL or another one.
Capture UUID avoids duplicates on the same capture'''
captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1,
prefered_uuids=set(lookyloo._captures_index.keys()))
for h_capture_uuid, url_uuid, url_hostname, same_url, url in details:
cache = lookyloo.capture_cache(h_capture_uuid)
if cache and hasattr(cache, 'title'):
if same_url:
captures_list['same_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
else:
captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
# Sort by timestamp by default
captures_list['same_url'].sort(key=lambda y: y[3])
captures_list['different_url'].sort(key=lambda y: y[3])
return total_captures, captures_list
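A small usage sketch (placeholders only, not part of this change): the second return value is a dict keyed by 'same_url' and 'different_url', each a list of (capture_uuid, url_uuid, title, timestamp, hostname) tuples already sorted by timestamp.
total, by_url = hash_lookup('<sha512>', 'https://example.com/', '<capture uuid>')
for capture_uuid, url_uuid, title, timestamp, hostname in by_url['different_url']:
    # Captures of the same body hash seen on other URLs, oldest first.
    print(f'{timestamp} {title} ({hostname})')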
def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[HostNode, list[dict[str, Any]]]:
'''Gather all the information needed to display the Hostnode investigator popup.'''
def normalize_known_content(h: str, /, known_content: dict[str, Any], url: URLNode) -> tuple[str | list[Any] | None, tuple[bool, Any] | None]:
''' There are a few different sources to figure out known vs. legitimate content,
this method normalizes it for the web interface.'''
known: str | list[Any] | None = None
legitimate: tuple[bool, Any] | None = None
if h not in known_content:
return known, legitimate
if known_content[h]['type'] in ['generic', 'sanejs']:
known = known_content[h]['details']
elif known_content[h]['type'] == 'legitimate_on_domain':
legit = False
if url.hostname in known_content[h]['details']:
legit = True
legitimate = (legit, known_content[h]['details'])
elif known_content[h]['type'] == 'malicious':
legitimate = (False, known_content[h]['details'])
return known, legitimate
ct = lookyloo.get_crawled_tree(capture_uuid)
hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
known_content = lookyloo.context.find_known_content(hostnode)
lookyloo.uwhois.query_whois_hostnode(hostnode)
urls: list[dict[str, Any]] = []
for url in hostnode.urls:
# For the popup, we need:
# * https vs http
# * everything after the domain
# * the full URL
to_append: dict[str, Any] = {
'encrypted': url.name.startswith('https'),
'url_path': url.name.split('/', 3)[-1],
'url_object': url,
}
if not url.empty_response:
# Index lookup
# %%% Full body %%%
freq = get_indexing(flask_login.current_user).body_hash_fequency(url.body_hash)
to_append['body_hash_details'] = freq
if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1:
to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid)
# %%% Embedded ressources %%%
if hasattr(url, 'embedded_ressources') and url.embedded_ressources:
to_append['embedded_ressources'] = {}
for mimetype, blobs in url.embedded_ressources.items():
for h, blob in blobs:
if h in to_append['embedded_ressources']:
# Skip duplicates
continue
freq_embedded = get_indexing(flask_login.current_user).body_hash_fequency(h)
to_append['embedded_ressources'][h] = freq_embedded
to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes
to_append['embedded_ressources'][h]['type'] = mimetype
if freq_embedded['hash_freq'] > 1:
to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid)
for h in to_append['embedded_ressources'].keys():
known, legitimate = normalize_known_content(h, known_content, url)
if known:
to_append['embedded_ressources'][h]['known_content'] = known
elif legitimate:
to_append['embedded_ressources'][h]['legitimacy'] = legitimate
known, legitimate = normalize_known_content(url.body_hash, known_content, url)
if known:
to_append['known_content'] = known
elif legitimate:
to_append['legitimacy'] = legitimate
# Optional: Cookies sent to the server in the request -> map to the nodes that set the cookie in a response
if hasattr(url, 'cookies_sent'):
to_display_sent: dict[str, set[Iterable[str | None]]] = defaultdict(set)
for cookie, contexts in url.cookies_sent.items():
if not contexts:
# Locally created?
to_display_sent[cookie].add(('Unknown origin', ))
continue
for context in contexts:
to_display_sent[cookie].add((context['setter'].hostname, context['setter'].hostnode_uuid))
to_append['cookies_sent'] = to_display_sent
# Optional: Cookies received from the server in the response -> map to the nodes that send the cookie in a request
if hasattr(url, 'cookies_received'):
to_display_received: dict[str, dict[str, set[Iterable[str | None]]]] = {'3rd_party': defaultdict(set), 'sent': defaultdict(set), 'not_sent': defaultdict(set)}
for domain, c_received, is_3rd_party in url.cookies_received:
if c_received not in ct.root_hartree.cookies_sent:
# This cookie is never sent.
if is_3rd_party:
to_display_received['3rd_party'][c_received].add((domain, ))
else:
to_display_received['not_sent'][c_received].add((domain, ))
continue
for url_node in ct.root_hartree.cookies_sent[c_received]:
if is_3rd_party:
to_display_received['3rd_party'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
else:
to_display_received['sent'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
to_append['cookies_received'] = to_display_received
urls.append(to_append)
return hostnode, urls
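A reading aid (sketch only, with placeholder UUIDs): each entry appended to urls is a dict whose keys drive the hostnode popup template; a few of them can be inspected directly.
hostnode, urls = get_hostnode_investigator('<capture uuid>', node_uuid='<host node uuid>')
for entry in urls:
    # 'encrypted' and 'url_path' are always present; the frequency details
    # only exist when the response had a body.
    print(entry['encrypted'], entry['url_path'])
    print(entry.get('body_hash_details', {}).get('hash_freq'))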
# ##### Hostnode level methods #####
@app.route('/tree/<string:tree_uuid>/host/<string:node_uuid>/hashes', methods=['GET'])
@ -283,7 +633,7 @@ def urls_hostnode(tree_uuid: str, node_uuid: str) -> Response:
@app.route('/tree/<string:tree_uuid>/host/<string:node_uuid>', methods=['GET'])
def hostnode_popup(tree_uuid: str, node_uuid: str) -> str | WerkzeugResponse | Response:
try:
hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)
hostnode, urls = get_hostnode_investigator(tree_uuid, node_uuid)
except IndexError:
return render_template('error.html', error_message='Sorry, this one is on us. The tree was rebuilt, please reload the tree and try again.')
@ -850,8 +1200,8 @@ def tree_favicons(tree_uuid: str) -> str:
continue
mimetype = from_string(favicon, mime=True)
favicon_sha512 = hashlib.sha512(favicon).hexdigest()
frequency = lookyloo.indexing.favicon_frequency(favicon_sha512)
number_captures = lookyloo.indexing.favicon_number_captures(favicon_sha512)
frequency = get_indexing(flask_login.current_user).favicon_frequency(favicon_sha512)
number_captures = get_indexing(flask_login.current_user).favicon_number_captures(favicon_sha512)
b64_favicon = base64.b64encode(favicon).decode()
mmh3_shodan = lookyloo.compute_mmh3_shodan(favicon)
favicons.append((favicon_sha512, frequency, number_captures, mimetype, b64_favicon, mmh3_shodan))
@ -860,7 +1210,7 @@ def tree_favicons(tree_uuid: str) -> str:
@app.route('/tree/<string:tree_uuid>/body_hashes', methods=['GET'])
def tree_body_hashes(tree_uuid: str) -> str:
body_hashes = lookyloo.get_all_body_hashes(tree_uuid)
body_hashes = get_all_body_hashes(tree_uuid)
return render_template('tree_body_hashes.html', tree_uuid=tree_uuid, body_hashes=body_hashes)
@ -958,27 +1308,27 @@ def index_hidden() -> str:
@app.route('/cookies', methods=['GET'])
def cookies_lookup() -> str:
cookies_names = [(name, freq, lookyloo.indexing.cookies_names_number_domains(name))
for name, freq in lookyloo.indexing.cookies_names]
cookies_names = [(name, freq, get_indexing(flask_login.current_user).cookies_names_number_domains(name))
for name, freq in get_indexing(flask_login.current_user).cookies_names]
return render_template('cookies.html', cookies_names=cookies_names)
@app.route('/hhhashes', methods=['GET'])
def hhhashes_lookup() -> str:
hhhashes = [(hhh, freq, lookyloo.indexing.http_headers_hashes_number_captures(hhh))
for hhh, freq in lookyloo.indexing.http_headers_hashes]
hhhashes = [(hhh, freq, get_indexing(flask_login.current_user).http_headers_hashes_number_captures(hhh))
for hhh, freq in get_indexing(flask_login.current_user).http_headers_hashes]
return render_template('hhhashes.html', hhhashes=hhhashes)
@app.route('/favicons', methods=['GET'])
def favicons_lookup() -> str:
favicons = []
for sha512, freq in lookyloo.indexing.favicons:
favicon = lookyloo.indexing.get_favicon(sha512)
for sha512, freq in get_indexing(flask_login.current_user).favicons:
favicon = get_indexing(flask_login.current_user).get_favicon(sha512)
if not favicon:
continue
favicon_b64 = base64.b64encode(favicon).decode()
nb_captures = lookyloo.indexing.favicon_number_captures(sha512)
nb_captures = get_indexing(flask_login.current_user).favicon_number_captures(sha512)
favicons.append((sha512, freq, nb_captures, favicon_b64))
return render_template('favicons.html', favicons=favicons)
@ -986,10 +1336,10 @@ def favicons_lookup() -> str:
@app.route('/ressources', methods=['GET'])
def ressources() -> str:
ressources = []
for h, freq in lookyloo.indexing.ressources:
domain_freq = lookyloo.indexing.ressources_number_domains(h)
for h, freq in get_indexing(flask_login.current_user).ressources:
domain_freq = get_indexing(flask_login.current_user).ressources_number_domains(h)
context = lookyloo.context.find_known_content(h)
capture_uuid, url_uuid, hostnode_uuid = lookyloo.indexing.get_hash_uuids(h)
capture_uuid, url_uuid, hostnode_uuid = get_indexing(flask_login.current_user).get_hash_uuids(h)
try:
ressource = lookyloo.get_ressource(capture_uuid, url_uuid, h)
except MissingUUID:
@ -1003,7 +1353,7 @@ def ressources() -> str:
@app.route('/categories', methods=['GET'])
def categories() -> str:
return render_template('categories.html', categories=lookyloo.indexing.categories)
return render_template('categories.html', categories=get_indexing(flask_login.current_user).categories)
@app.route('/rebuild_all')
@ -1057,7 +1407,7 @@ def recapture(tree_uuid: str) -> str | Response | WerkzeugResponse:
@app.route('/ressource_by_hash/<string:sha512>', methods=['GET'])
@file_response # type: ignore[misc]
def ressource_by_hash(sha512: str) -> Response:
details, body = lookyloo.get_body_hash_full(sha512)
details, body = get_body_hash_full(sha512)
return send_file(body, as_attachment=True, download_name='ressource.bin')
@ -1245,13 +1595,13 @@ def capture_web() -> str | Response | WerkzeugResponse:
@app.route('/cookies/<string:cookie_name>', methods=['GET'])
def cookies_name_detail(cookie_name: str) -> str:
captures, domains = lookyloo.get_cookie_name_investigator(cookie_name.strip())
captures, domains = get_cookie_name_investigator(cookie_name.strip())
return render_template('cookie_name.html', cookie_name=cookie_name, domains=domains, captures=captures)
@app.route('/hhhdetails/<string:hhh>', methods=['GET'])
def hhh_detail(hhh: str) -> str:
captures, headers = lookyloo.get_hhh_investigator(hhh.strip())
captures, headers = get_hhh_investigator(hhh.strip())
return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)
@ -1259,7 +1609,7 @@ def hhh_detail(hhh: str) -> str:
@app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:
_get_prob = bool(get_probabilistic)
captures, favicon, probabilistic_favicons = lookyloo.get_favicon_investigator(favicon_sha512.strip(), get_probabilistic=_get_prob)
captures, favicon, probabilistic_favicons = get_favicon_investigator(favicon_sha512.strip(), get_probabilistic=_get_prob)
mimetype, b64_favicon, mmh3_shodan = favicon
return render_template('favicon_details.html', favicon_sha512=favicon_sha512,
captures=captures, mimetype=mimetype, b64_favicon=b64_favicon, mmh3_shodan=mmh3_shodan,
@ -1269,20 +1619,20 @@ def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:
@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
def body_hash_details(body_hash: str) -> str:
from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False
captures, domains = lookyloo.get_body_hash_investigator(body_hash.strip())
captures, domains = _get_body_hash_investigator(body_hash.strip())
return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures, from_popup=from_popup)
@app.route('/urls/<string:url>', methods=['GET'])
def url_details(url: str) -> str:
url = unquote_plus(url).strip()
hits = lookyloo.get_url_occurrences(url, limit=50)
hits = get_url_occurrences(url, limit=50)
return render_template('url.html', url=url, hits=hits)
@app.route('/hostnames/<string:hostname>', methods=['GET'])
def hostname_details(hostname: str) -> str:
hits = lookyloo.get_hostname_occurrences(hostname.strip(), with_urls_occurrences=True, limit=50)
hits = get_hostname_occurrences(hostname.strip(), with_urls_occurrences=True, limit=50)
return render_template('hostname.html', hostname=hostname, hits=hits)

View File

@ -287,7 +287,9 @@ class TriggerModules(Resource): # type: ignore[misc]
params={'h': 'The hash (sha512)'})
class HashInfo(Resource): # type: ignore[misc]
def get(self, h: str) -> dict[str, Any] | tuple[dict[str, Any], int]:
details, body = lookyloo.get_body_hash_full(h)
from . import get_body_hash_full
details, body = get_body_hash_full(h)
if not details:
return {'error': 'Unknown Hash.'}, 400
to_return: dict[str, Any] = {'response': {'hash': h, 'details': details,
@ -308,8 +310,9 @@ class URLInfo(Resource): # type: ignore[misc]
@api.doc(body=url_info_fields) # type: ignore[misc]
def post(self) -> list[dict[str, Any]]:
from . import get_url_occurrences
to_query: dict[str, Any] = request.get_json(force=True)
occurrences = lookyloo.get_url_occurrences(to_query.pop('url'), **to_query)
occurrences = get_url_occurrences(to_query.pop('url'), **to_query)
return occurrences
@ -326,8 +329,9 @@ class HostnameInfo(Resource): # type: ignore[misc]
@api.doc(body=hostname_info_fields) # type: ignore[misc]
def post(self) -> list[dict[str, Any]]:
from . import get_hostname_occurrences
to_query: dict[str, Any] = request.get_json(force=True)
return lookyloo.get_hostname_occurrences(to_query.pop('hostname'), **to_query)
return get_hostname_occurrences(to_query.pop('hostname'), **to_query)
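For completeness, a hedged sketch of exercising the helper these API resources now delegate to (hostname and limit are illustrative values):
# Illustrative only: the same call HostnameInfo.post() performs internally.
hits = get_hostname_occurrences('example.com', with_urls_occurrences=True, limit=10)
for hit in hits:
    print(hit['capture_uuid'], hit['start_timestamp'], hit['title'])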
@api.route('/json/stats')