mirror of https://github.com/CIRCL/lookyloo

new: TLDs indexing, new format for capture internals indexes

parent 193456a7e5
commit b3a4f539b0

@@ -28,8 +28,7 @@ from pyipasnhistory import IPASNHistory # type: ignore[attr-defined]
 from redis import Redis

 from .context import Context
-from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
-from .indexing import Indexing
+from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree, get_indexing
 from .default import LookylooException, try_make_file, get_config
 from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
 from .modules import Cloudflare

@@ -119,7 +118,6 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(get_config('generic', 'loglevel'))
         self.redis = redis
-        self.indexing = Indexing()
         self.contextualizer = contextualizer
         self.__cache_max_size = maxsize
         self.__cache: dict[str, CaptureCache] = OrderedDict()

@@ -363,7 +361,10 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
         try:
             logger.debug('The tree needs to be rebuilt.')
             tree = self._create_pickle(capture_dir, logger)
-            self.indexing.force_reindex(uuid)
+            # Force the reindexing in the public and full index (if enabled)
+            get_indexing().force_reindex(uuid)
+            if get_config('generic', 'index_everything'):
+                get_indexing(full=True).force_reindex(uuid)
         except NoValidHarFile:
             logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
             tree = None

@@ -446,9 +446,7 @@ def load_user_config(username: str) -> dict[str, Any] | None:
 @cache
 def get_indexing(full: bool=False) -> Indexing:
     from .indexing import Indexing
-    if not get_config('generic', 'index_everything'):
-        return Indexing()
-
-    if full:
+    if get_config('generic', 'index_everything') and full:
         return Indexing(full_index=True)
     return Indexing()
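
For context, a minimal usage sketch of this cached helper (it lives in lookyloo/helpers.py, as the later import `from lookyloo.helpers import get_indexing as get_indexing_cache` shows). The capture UUID is a placeholder and a running Lookyloo Redis/kvrocks backend is assumed:

    from lookyloo.helpers import get_indexing
    from lookyloo.default import get_config

    capture_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder UUID

    # Public index: always available, and cached so it is built once per process.
    get_indexing().force_reindex(capture_uuid)

    # Full index: only meaningful when index_everything is enabled in the generic config;
    # otherwise get_indexing(full=True) falls back to the public index.
    if get_config('generic', 'index_everything'):
        get_indexing(full=True).force_reindex(capture_uuid)

This is the same pattern the tree-rebuild hunk above now follows.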

@@ -5,9 +5,10 @@ from __future__ import annotations
 import base64
 import hashlib
 import logging
-# import re

 from io import BytesIO
 from collections import defaultdict
+from datetime import datetime, timedelta
 from urllib.parse import urlsplit
 from zipfile import ZipFile

@@ -76,13 +77,22 @@ class Indexing():
         p.srem('indexed_favicons', capture_uuid)
         p.srem('indexed_identifiers', capture_uuid)
         p.srem('indexed_categories', capture_uuid)
+        p.srem('indexed_tlds', capture_uuid)
         for identifier_type in self.identifiers_types():
             p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
         for hash_type in self.captures_hashes_types():
             p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
+        for internal_index in self.redis.smembers(f'capture_indexes|{capture_uuid}'):
+            # internal_index can be "tlds"
+            for entry in self.redis.smembers(f'capture_indexes|{capture_uuid}|{internal_index}'):
+                # entry can be a "com", we delete a set of UUIDs, remove from the captures set
+                p.delete(f'capture_indexes|{capture_uuid}|{internal_index}|{entry}')
+                p.zrem(f'{internal_index}|{entry}|captures', capture_uuid)
+            p.delete(f'capture_indexes|{capture_uuid}|{internal_index}')
+        p.delete(f'capture_indexes|{capture_uuid}')
         p.execute()

-    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool]:
+    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool]:
         p = self.redis.pipeline()
         p.sismember('indexed_urls', capture_uuid)
         p.sismember('indexed_body_hashes', capture_uuid)
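
To make the cleanup loop above concrete, here is a small sketch of the new per-capture internal index layout it assumes. The key patterns come straight from the diff; the UUID, the index name ('tlds') and the 'com' entry are illustrative placeholders:

    capture_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder UUID

    # Keys created for one capture once the "tlds" internal index is populated.
    keys = {
        'internal indexes of the capture': f'capture_indexes|{capture_uuid}',           # set, e.g. {'tlds'}
        'entries of the "tlds" index':     f'capture_indexes|{capture_uuid}|tlds',      # set, e.g. {'com', 'lu'}
        'URL node UUIDs for one entry':    f'capture_indexes|{capture_uuid}|tlds|com',  # set of node UUIDs
        'captures per entry (global)':     'tlds|com|captures',                         # sorted set, scored by capture start time
    }
    for description, key in keys.items():
        print(f'{description}: {key}')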

@@ -91,11 +101,12 @@ class Indexing():
         p.sismember('indexed_favicons', capture_uuid)
         p.sismember('indexed_identifiers', capture_uuid)
         p.sismember('indexed_categories', capture_uuid)
+        p.sismember('indexed_tlds', capture_uuid)
         # We also need to check if the hash_type are all indexed for this capture
         hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
         to_return: list[bool] = p.execute()
         to_return.append(hash_types_indexed)
-        # This call for sure returns a tuple of 7 booleans
+        # This call for sure returns a tuple of 8 booleans
         return tuple(to_return) # type: ignore[return-value]

     def index_capture(self, uuid_to_index: str, directory: Path) -> None:

@@ -145,6 +156,9 @@ class Indexing():
             self.logger.info(f'Indexing categories for {uuid_to_index}')
             self.index_categories_capture(uuid_to_index, directory)
         if not indexed[7]:
+            self.logger.info(f'Indexing TLDs for {uuid_to_index}')
+            self.index_tld_capture(ct)
+        if not indexed[8]:
             self.logger.info(f'Indexing hash types for {uuid_to_index}')
             self.index_capture_hashes_types(ct)

@@ -345,7 +359,7 @@ class Indexing():
             if 'hhhash' not in urlnode.features:
                 continue
             if urlnode.hhhash in already_loaded:
-                # Only add cookie name once / capture
+                # Only add HTTP header Hash once / capture
                 continue
             already_loaded.add(urlnode.hhhash)
             if urlnode.hhhash not in already_cleaned_up:

@@ -401,6 +415,65 @@ class Indexing():
     def get_captures_hostname(self, hostname: str) -> set[str]:
         return self.redis.smembers(f'hostnames|{hostname}|captures')

+    # ###### TLDs ######
+
+    @property
+    def tlds(self) -> set[str]:
+        return self.redis.smembers('tlds')
+
+    def index_tld_capture(self, crawled_tree: CrawledTree) -> None:
+        if self.redis.sismember('indexed_tlds', crawled_tree.uuid):
+            # Do not reindex
+            return
+        self.redis.sadd('indexed_tlds', crawled_tree.uuid)
+        self.logger.debug(f'Indexing TLDs for {crawled_tree.uuid} ... ')
+        pipeline = self.redis.pipeline()
+
+        # Add the tlds key in internal indexes set
+        internal_index = f'capture_indexes|{crawled_tree.uuid}'
+        pipeline.sadd(internal_index, 'tlds')
+
+        already_indexed_global: set[str] = set()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if not hasattr(urlnode, 'known_tld'):
+                # No TLD in the node.
+                continue
+            if urlnode.known_tld not in already_indexed_global:
+                # TLD hasn't been indexed in that run yet
+                already_indexed_global.add(urlnode.known_tld)
+                pipeline.sadd(f'{internal_index}|tlds', urlnode.known_tld) # Only used to delete index
+                pipeline.sadd('tlds', urlnode.known_tld)
+                pipeline.zadd(f'tlds|{urlnode.known_tld}|captures',
+                              mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
+
+            # Add hostnode UUID in internal index
+            pipeline.sadd(f'{internal_index}|tlds|{urlnode.known_tld}', urlnode.uuid)
+
+        pipeline.execute()
+        self.logger.debug(f'done with TLDs for {crawled_tree.uuid}.')
+
+    def get_captures_tld(self, tld: str, most_recent_capture: datetime | None = None,
+                         oldest_capture: datetime | None= None) -> list[tuple[str, float]]:
+        """Get all the captures for a specific TLD, on a time interval starting from the most recent one.
+
+        :param tld: The TLD
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 5 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=5)).timestamp()
+        return self.redis.zrevrangebyscore(f'tlds|{tld}|captures', max_score, min_score, withscores=True)
+
+    def get_capture_tld_counter(self, capture_uuid: str, tld: str) -> int:
+        # NOTE: what to do when the capture isn't indexed yet? Raise an exception?
+        # For now, return 0
+        return self.redis.scard(f'capture_indexes|{capture_uuid}|tlds|{tld}')
+
+    def get_capture_tld_nodes(self, capture_uuid: str, tld: str) -> set[str]:
+        if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|tlds|{tld}'):
+            return set(url_nodes)
+        return set()
+
     # ###### favicons ######

     @property
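
A minimal lookup sketch using these new methods, assuming a running Lookyloo indexing backend (Redis/kvrocks); the TLD value and the time window are illustrative:

    from datetime import datetime, timedelta
    from lookyloo.indexing import Indexing

    indexing = Indexing()

    # Captures that hit a .com URL over the last two days, most recent first,
    # returned as (capture UUID, capture start timestamp) tuples.
    recent_com = indexing.get_captures_tld(
        'com',
        most_recent_capture=datetime.now(),
        oldest_capture=datetime.now() - timedelta(days=2))

    for capture_uuid, start_time in recent_com:
        # How many URL nodes in that capture use the TLD, and which node UUIDs they are.
        print(capture_uuid, indexing.get_capture_tld_counter(capture_uuid, 'com'))
        print(indexing.get_capture_tld_nodes(capture_uuid, 'com'))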

@@ -223,11 +223,21 @@ class Lookyloo():
         ct = self.get_crawled_tree(capture_uuid)
         return ct.root_hartree.get_url_node_by_uuid(node_uuid)

+    def get_urlnodes_from_tree(self, capture_uuid: str, /, node_uuids: Iterable[str]) -> list[URLNode]:
+        '''Get a list of URL nodes from a tree, by UUID'''
+        ct = self.get_crawled_tree(capture_uuid)
+        return [ct.root_hartree.get_url_node_by_uuid(node_uuid) for node_uuid in node_uuids]
+
     def get_hostnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> HostNode:
         '''Get a host node from a tree, by UUID'''
         ct = self.get_crawled_tree(capture_uuid)
         return ct.root_hartree.get_host_node_by_uuid(node_uuid)

+    def get_hostnodes_from_tree(self, capture_uuid: str, /, node_uuids: Iterable[str]) -> list[HostNode]:
+        '''Get a list of host nodes from a tree, by UUID'''
+        ct = self.get_crawled_tree(capture_uuid)
+        return [ct.root_hartree.get_host_node_by_uuid(node_uuid) for node_uuid in node_uuids]
+
     def get_statistics(self, capture_uuid: str, /) -> dict[str, Any]:
         '''Get the statistics of a capture.'''
         ct = self.get_crawled_tree(capture_uuid)
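
These batch accessors let an index lookup that returns several node UUIDs be resolved in one call. A small sketch, assuming a configured Lookyloo installation; the capture and node UUIDs are placeholders:

    from lookyloo import Lookyloo

    lookyloo = Lookyloo()
    capture_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder capture UUID

    # The node UUIDs would typically come from an index lookup,
    # e.g. Indexing.get_capture_tld_nodes(capture_uuid, 'com').
    node_uuids = ['node-uuid-1', 'node-uuid-2']  # placeholder node UUIDs

    urlnodes = lookyloo.get_urlnodes_from_tree(capture_uuid, node_uuids)
    print([node.name for node in urlnodes])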

@@ -6,6 +6,8 @@ import base64
 import gzip
 import hashlib
 import json
+import logging
+import logging.config

 from io import BytesIO
 from typing import Any

@@ -20,6 +22,7 @@ from werkzeug.security import check_password_hash
 from lacuscore import CaptureStatus as CaptureStatusCore, CaptureSettingsError
 from pylacus import CaptureStatus as CaptureStatusPy
 from lookyloo import CaptureSettings, Lookyloo
+from lookyloo.default import get_config
 from lookyloo.comparator import Comparator
 from lookyloo.exceptions import MissingUUID, NoValidHarFile
 from lookyloo.helpers import load_user_config

@@ -31,6 +34,7 @@ api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')

 lookyloo: Lookyloo = get_lookyloo_instance()
 comparator: Comparator = Comparator()
+logging.config.dictConfig(get_config('logging'))


 def api_auth_check(method): # type: ignore[no-untyped-def]

@@ -825,3 +829,37 @@ class CategoriesCaptures(Resource): # type: ignore[misc]
             return list(get_indexing(flask_login.current_user).get_captures_category(category))
         return {c: list(get_indexing(flask_login.current_user).get_captures_category(c))
                 for c in existing_categories}
+
+
+# NOTE: there are a few extra paramaters we may want to add in the future: most recent/oldest capture
+@api.route('/json/tlds')
+@api.doc(description='Get captures with hits on a specific TLD, to TLD returns the a list of most frequent TLDs.')
+class TLDCaptures(Resource): # type: ignore[misc]
+
+    @api.param('tld', 'Get captures with a specific TLD and their capture timestamp.') # type: ignore[misc]
+    @api.param('urls_only', 'Returns recent URLs with that TLD, regardless the capture.') # type: ignore[misc]
+    def get(self) -> list[tuple[str, float]] | list[str]:
+        tld: str | None = request.args['tld'] if request.args.get('tld') else None
+        urls_only: bool | None = True if request.args.get('urls_only') else None
+        if not tld:
+            return list(get_indexing(flask_login.current_user).tlds)
+
+        recent_captures_with_tld = get_indexing(flask_login.current_user).get_captures_tld(tld)
+        if not recent_captures_with_tld:
+            return []
+        if not urls_only:
+            return recent_captures_with_tld
+        # get the capture, get the node uuids, get the names, make it a list
+        to_return: set[str] = set()
+        # Make sure to only get the captures with a pickle ready
+        cache = lookyloo.sorted_capture_cache([uuid for uuid, _ in recent_captures_with_tld], cached_captures_only=True)
+        for c in cache:
+            uuid = c.uuid
+            nodes_with_tld = get_indexing(flask_login.current_user).get_capture_tld_nodes(uuid, tld)
+            try:
+                to_return.update(node.name for node in lookyloo.get_urlnodes_from_tree(uuid, nodes_with_tld))
+            except IndexError:
+                # The capture needs to be re-indexed
+                # NOTE: If this warning it printed on a loop for a capture, we have a problem with the index.
+                logging.warning(f'Capture {uuid} needs to be re-indexed.')
+                get_indexing(flask_login.current_user).force_reindex(uuid)
+        return list(to_return)
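
One way to exercise the new endpoint once deployed, sketched with the requests library; the base URL is a placeholder for your own instance and the TLD value is illustrative:

    import requests

    base_url = 'http://127.0.0.1:5100'  # placeholder, adjust to your Lookyloo instance

    # No tld parameter: returns the list of indexed TLDs.
    print(requests.get(f'{base_url}/json/tlds').json())

    # With a tld: recent captures hitting that TLD, as (capture UUID, timestamp) pairs.
    print(requests.get(f'{base_url}/json/tlds', params={'tld': 'com'}).json())

    # With urls_only set: the URLs (node names) with that TLD across those captures.
    print(requests.get(f'{base_url}/json/tlds', params={'tld': 'com', 'urls_only': 1}).json())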

@@ -6,7 +6,7 @@ import hashlib
 import json
 import os
 import re
-from functools import lru_cache, cache
+from functools import lru_cache
 from pathlib import Path

 import flask_login # type: ignore[import-untyped]

@@ -14,6 +14,7 @@ from flask import Request
 from werkzeug.security import generate_password_hash

 from lookyloo import Lookyloo, Indexing
+from lookyloo.helpers import get_indexing as get_indexing_cache
 from lookyloo.default import get_config, get_homedir, LookylooException

 __global_lookyloo_instance = None

@@ -118,18 +119,10 @@ def sri_load() -> dict[str, dict[str, str]]:
         return json.load(f)


-@cache
 def get_indexing(user: User | None) -> Indexing:
     '''Depending if we're logged in or not, we (can) get different indexes:
     if index_everything is enabled, we have an index in kvrocks that contains all
     the indexes for all the captures.
     It is only accessible to the admin user.
     '''
-    if not get_config('generic', 'index_everything'):
-        return Indexing()
-
-    if not user or not user.is_authenticated:
-        # No user or anonymous
-        return Indexing()
-    # Logged in user
-    return Indexing(full_index=True)
+    return get_indexing_cache(full=bool(user and user.is_authenticated))
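
The net behaviour of the rewritten website helper, sketched as a standalone truth table; this mirrors the logic of the two get_indexing helpers above, and the function below is illustrative rather than part of the codebase:

    def pick_index(index_everything: bool, authenticated: bool) -> str:
        # Mirrors: website get_indexing(user) -> lookyloo.helpers.get_indexing(full=authenticated)
        if index_everything and authenticated:
            return 'Indexing(full_index=True)'  # full index, admin only
        return 'Indexing()'                     # public index

    for index_everything in (True, False):
        for authenticated in (True, False):
            print(f'index_everything={index_everything}, authenticated={authenticated} '
                  f'-> {pick_index(index_everything, authenticated)}')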