new: TLDs indexing, new format for capture internals indexes

pull/940/head
Raphaël Vinot 2024-09-24 15:40:40 +02:00
parent 193456a7e5
commit b3a4f539b0
6 changed files with 135 additions and 22 deletions

View File

@@ -28,8 +28,7 @@ from pyipasnhistory import IPASNHistory  # type: ignore[attr-defined]
 from redis import Redis

 from .context import Context
-from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree
-from .indexing import Indexing
+from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree, get_indexing
 from .default import LookylooException, try_make_file, get_config
 from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild
 from .modules import Cloudflare
@@ -119,7 +118,6 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(get_config('generic', 'loglevel'))
         self.redis = redis
-        self.indexing = Indexing()
         self.contextualizer = contextualizer
         self.__cache_max_size = maxsize
         self.__cache: dict[str, CaptureCache] = OrderedDict()
@@ -363,7 +361,10 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
             try:
                 logger.debug('The tree needs to be rebuilt.')
                 tree = self._create_pickle(capture_dir, logger)
-                self.indexing.force_reindex(uuid)
+                # Force the reindexing in the public and full index (if enabled)
+                get_indexing().force_reindex(uuid)
+                if get_config('generic', 'index_everything'):
+                    get_indexing(full=True).force_reindex(uuid)
             except NoValidHarFile:
                 logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
                 tree = None
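
The rebuild path now refreshes both indexes explicitly. A minimal sketch of the same pattern factored into a standalone helper (the function name force_reindex_everywhere is hypothetical, not part of this commit):

    from lookyloo.default import get_config
    from lookyloo.helpers import get_indexing


    def force_reindex_everywhere(capture_uuid: str) -> None:
        # Hypothetical convenience wrapper around the pattern used above:
        # always refresh the public index ...
        get_indexing().force_reindex(capture_uuid)
        # ... and the full index too, but only when index_everything is enabled.
        if get_config('generic', 'index_everything'):
            get_indexing(full=True).force_reindex(capture_uuid)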

View File

@@ -446,9 +446,7 @@ def load_user_config(username: str) -> dict[str, Any] | None:
 @cache
 def get_indexing(full: bool=False) -> Indexing:
     from .indexing import Indexing
-    if not get_config('generic', 'index_everything'):
-        return Indexing()
-    if full:
+    if get_config('generic', 'index_everything') and full:
         return Indexing(full_index=True)
     return Indexing()
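
Since get_indexing is wrapped in @cache, each variant is instantiated once and reused. A minimal usage sketch, assuming a configured Lookyloo install:

    from lookyloo.default import get_config
    from lookyloo.helpers import get_indexing

    public_index = get_indexing()  # public index, always available
    if get_config('generic', 'index_everything'):
        full_index = get_indexing(full=True)           # full index, admin-only data
        assert full_index is get_indexing(full=True)   # cached: same instance every call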

View File

@@ -5,9 +5,10 @@ from __future__ import annotations
 import base64
 import hashlib
 import logging
+# import re

 from io import BytesIO
 from collections import defaultdict
+from datetime import datetime, timedelta
 from urllib.parse import urlsplit
 from zipfile import ZipFile
@@ -76,13 +77,22 @@ class Indexing():
         p.srem('indexed_favicons', capture_uuid)
         p.srem('indexed_identifiers', capture_uuid)
         p.srem('indexed_categories', capture_uuid)
+        p.srem('indexed_tlds', capture_uuid)
         for identifier_type in self.identifiers_types():
             p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
         for hash_type in self.captures_hashes_types():
             p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
+        for internal_index in self.redis.smembers(f'capture_indexes|{capture_uuid}'):
+            # internal_index can be "tlds"
+            for entry in self.redis.smembers(f'capture_indexes|{capture_uuid}|{internal_index}'):
+                # entry can be "com": delete the set of node UUIDs, and remove the capture from the captures set
+                p.delete(f'capture_indexes|{capture_uuid}|{internal_index}|{entry}')
+                p.zrem(f'{internal_index}|{entry}|captures', capture_uuid)
+            p.delete(f'capture_indexes|{capture_uuid}|{internal_index}')
+        p.delete(f'capture_indexes|{capture_uuid}')
         p.execute()

-    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool]:
+    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool]:
         p = self.redis.pipeline()
         p.sismember('indexed_urls', capture_uuid)
         p.sismember('indexed_body_hashes', capture_uuid)
@@ -91,11 +101,12 @@ class Indexing():
         p.sismember('indexed_favicons', capture_uuid)
         p.sismember('indexed_identifiers', capture_uuid)
         p.sismember('indexed_categories', capture_uuid)
+        p.sismember('indexed_tlds', capture_uuid)
         # We also need to check if the hash_type are all indexed for this capture
         hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
         to_return: list[bool] = p.execute()
         to_return.append(hash_types_indexed)
-        # This call for sure returns a tuple of 7 booleans
+        # This call for sure returns a tuple of 8 booleans
         return tuple(to_return)  # type: ignore[return-value]

     def index_capture(self, uuid_to_index: str, directory: Path) -> None:
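
The cleanup loop added above walks a per-capture internal index whose key layout can be read off this commit. A hedged debugging sketch that lists those keys for one capture without deleting anything (it assumes a Redis client created with decode_responses=True, as Lookyloo uses for its indexes):

    from redis import Redis


    def dump_capture_internal_index(r: Redis, capture_uuid: str) -> None:
        # capture_indexes|<uuid> lists the internal indexes of the capture (currently only 'tlds')
        for internal_index in r.smembers(f'capture_indexes|{capture_uuid}'):
            # capture_indexes|<uuid>|tlds lists the entries the capture contributed, e.g. 'com'
            for entry in r.smembers(f'capture_indexes|{capture_uuid}|{internal_index}'):
                # capture_indexes|<uuid>|tlds|com holds the URL node UUIDs that hit that entry
                nodes = r.smembers(f'capture_indexes|{capture_uuid}|{internal_index}|{entry}')
                # tlds|com|captures is a sorted set of capture UUIDs scored by capture timestamp
                score = r.zscore(f'{internal_index}|{entry}|captures', capture_uuid)
                print(internal_index, entry, f'{len(nodes)} node(s)', score)
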
@@ -145,6 +156,9 @@ class Indexing():
             self.logger.info(f'Indexing categories for {uuid_to_index}')
             self.index_categories_capture(uuid_to_index, directory)
         if not indexed[7]:
+            self.logger.info(f'Indexing TLDs for {uuid_to_index}')
+            self.index_tld_capture(ct)
+        if not indexed[8]:
             self.logger.info(f'Indexing hash types for {uuid_to_index}')
             self.index_capture_hashes_types(ct)
@@ -345,7 +359,7 @@ class Indexing():
             if 'hhhash' not in urlnode.features:
                 continue
             if urlnode.hhhash in already_loaded:
-                # Only add cookie name once / capture
+                # Only add HTTP header Hash once / capture
                 continue
             already_loaded.add(urlnode.hhhash)
             if urlnode.hhhash not in already_cleaned_up:
@@ -401,6 +415,65 @@ class Indexing():
     def get_captures_hostname(self, hostname: str) -> set[str]:
         return self.redis.smembers(f'hostnames|{hostname}|captures')

+    # ###### TLDs ######
+
+    @property
+    def tlds(self) -> set[str]:
+        return self.redis.smembers('tlds')
+
+    def index_tld_capture(self, crawled_tree: CrawledTree) -> None:
+        if self.redis.sismember('indexed_tlds', crawled_tree.uuid):
+            # Do not reindex
+            return
+        self.redis.sadd('indexed_tlds', crawled_tree.uuid)
+        self.logger.debug(f'Indexing TLDs for {crawled_tree.uuid} ... ')
+        pipeline = self.redis.pipeline()
+
+        # Add the "tlds" key in the capture's internal indexes set
+        internal_index = f'capture_indexes|{crawled_tree.uuid}'
+        pipeline.sadd(internal_index, 'tlds')
+
+        already_indexed_global: set[str] = set()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if not hasattr(urlnode, 'known_tld'):
+                # No TLD in the node.
+                continue
+            if urlnode.known_tld not in already_indexed_global:
+                # TLD hasn't been indexed in that run yet
+                already_indexed_global.add(urlnode.known_tld)
+                pipeline.sadd(f'{internal_index}|tlds', urlnode.known_tld)  # Only used to delete the index
+                pipeline.sadd('tlds', urlnode.known_tld)
+                pipeline.zadd(f'tlds|{urlnode.known_tld}|captures',
+                              mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
+            # Add the URL node UUID in the internal index
+            pipeline.sadd(f'{internal_index}|tlds|{urlnode.known_tld}', urlnode.uuid)
+        pipeline.execute()
+        self.logger.debug(f'done with TLDs for {crawled_tree.uuid}.')
+
+    def get_captures_tld(self, tld: str, most_recent_capture: datetime | None = None,
+                         oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
+        """Get all the captures for a specific TLD, on a time interval starting from the most recent one.
+
+        :param tld: The TLD
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 5 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=5)).timestamp()
+        return self.redis.zrevrangebyscore(f'tlds|{tld}|captures', max_score, min_score, withscores=True)
+
+    def get_capture_tld_counter(self, capture_uuid: str, tld: str) -> int:
+        # NOTE: what to do when the capture isn't indexed yet? Raise an exception?
+        # For now, return 0
+        return self.redis.scard(f'capture_indexes|{capture_uuid}|tlds|{tld}')
+
+    def get_capture_tld_nodes(self, capture_uuid: str, tld: str) -> set[str]:
+        if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|tlds|{tld}'):
+            return set(url_nodes)
+        return set()
+
     # ###### favicons ######

     @property
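
A minimal usage sketch of the new query helpers against the public index (the .com TLD and the two-day window are illustrative):

    from datetime import datetime, timedelta

    from lookyloo.helpers import get_indexing

    indexing = get_indexing()

    # Captures that hit a ".com" URL over the last two days, most recent first,
    # returned as (capture_uuid, capture_timestamp) tuples from tlds|com|captures.
    for capture_uuid, timestamp in indexing.get_captures_tld('com', oldest_capture=datetime.now() - timedelta(days=2)):
        # Number of URL nodes in that capture that matched the TLD
        hits = indexing.get_capture_tld_counter(capture_uuid, 'com')
        print(capture_uuid, datetime.fromtimestamp(timestamp).isoformat(), hits)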

View File

@@ -223,11 +223,21 @@ class Lookyloo():
         ct = self.get_crawled_tree(capture_uuid)
         return ct.root_hartree.get_url_node_by_uuid(node_uuid)

+    def get_urlnodes_from_tree(self, capture_uuid: str, /, node_uuids: Iterable[str]) -> list[URLNode]:
+        '''Get a list of URL nodes from a tree, by UUID'''
+        ct = self.get_crawled_tree(capture_uuid)
+        return [ct.root_hartree.get_url_node_by_uuid(node_uuid) for node_uuid in node_uuids]
+
     def get_hostnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> HostNode:
         '''Get a host node from a tree, by UUID'''
         ct = self.get_crawled_tree(capture_uuid)
         return ct.root_hartree.get_host_node_by_uuid(node_uuid)

+    def get_hostnodes_from_tree(self, capture_uuid: str, /, node_uuids: Iterable[str]) -> list[HostNode]:
+        '''Get a list of host nodes from a tree, by UUID'''
+        ct = self.get_crawled_tree(capture_uuid)
+        return [ct.root_hartree.get_host_node_by_uuid(node_uuid) for node_uuid in node_uuids]
+
     def get_statistics(self, capture_uuid: str, /) -> dict[str, Any]:
         '''Get the statistics of a capture.'''
         ct = self.get_crawled_tree(capture_uuid)
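
These batch helpers load the crawled tree once and resolve every node UUID against it, which is what the TLD endpoint below relies on. A hedged sketch combining them with the new node index (the capture UUID is a placeholder):

    from lookyloo import Lookyloo
    from lookyloo.helpers import get_indexing

    lookyloo = Lookyloo()
    indexing = get_indexing()

    capture_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder UUID
    node_uuids = indexing.get_capture_tld_nodes(capture_uuid, 'com')

    # One tree load, then every URL node with a .com URL in that capture
    urlnodes = lookyloo.get_urlnodes_from_tree(capture_uuid, node_uuids)
    print(sorted(node.name for node in urlnodes))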

View File

@@ -6,6 +6,8 @@ import base64
 import gzip
 import hashlib
 import json
+import logging
+import logging.config

 from io import BytesIO
 from typing import Any
@@ -20,6 +22,7 @@ from werkzeug.security import check_password_hash
 from lacuscore import CaptureStatus as CaptureStatusCore, CaptureSettingsError
 from pylacus import CaptureStatus as CaptureStatusPy

 from lookyloo import CaptureSettings, Lookyloo
+from lookyloo.default import get_config
 from lookyloo.comparator import Comparator
 from lookyloo.exceptions import MissingUUID, NoValidHarFile
 from lookyloo.helpers import load_user_config
@@ -31,6 +34,7 @@ api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')
 lookyloo: Lookyloo = get_lookyloo_instance()
 comparator: Comparator = Comparator()

+logging.config.dictConfig(get_config('logging'))

 def api_auth_check(method):  # type: ignore[no-untyped-def]
@@ -825,3 +829,37 @@ class CategoriesCaptures(Resource):  # type: ignore[misc]
             return list(get_indexing(flask_login.current_user).get_captures_category(category))
         return {c: list(get_indexing(flask_login.current_user).get_captures_category(c))
                 for c in existing_categories}
+
+
+# NOTE: there are a few extra parameters we may want to add in the future: most recent/oldest capture
+@api.route('/json/tlds')
+@api.doc(description='Get captures with hits on a specific TLD; if no TLD is given, returns a list of the most frequent TLDs.')
+class TLDCaptures(Resource):  # type: ignore[misc]
+
+    @api.param('tld', 'Get captures with a specific TLD and their capture timestamp.')  # type: ignore[misc]
+    @api.param('urls_only', 'Returns recent URLs with that TLD, regardless of the capture.')  # type: ignore[misc]
+    def get(self) -> list[tuple[str, float]] | list[str]:
+        tld: str | None = request.args['tld'] if request.args.get('tld') else None
+        urls_only: bool | None = True if request.args.get('urls_only') else None
+        if not tld:
+            return list(get_indexing(flask_login.current_user).tlds)
+        recent_captures_with_tld = get_indexing(flask_login.current_user).get_captures_tld(tld)
+        if not recent_captures_with_tld:
+            return []
+        if not urls_only:
+            return recent_captures_with_tld
+        # Get the captures, get the node UUIDs, get the URL names, make it a list
+        to_return: set[str] = set()
+        # Make sure to only get the captures with a pickle ready
+        cache = lookyloo.sorted_capture_cache([uuid for uuid, _ in recent_captures_with_tld], cached_captures_only=True)
+        for c in cache:
+            uuid = c.uuid
+            nodes_with_tld = get_indexing(flask_login.current_user).get_capture_tld_nodes(uuid, tld)
+            try:
+                to_return.update(node.name for node in lookyloo.get_urlnodes_from_tree(uuid, nodes_with_tld))
+            except IndexError:
+                # The capture needs to be re-indexed.
+                # NOTE: if this warning is printed in a loop for a capture, we have a problem with the index.
+                logging.warning(f'Capture {uuid} needs to be re-indexed.')
+                get_indexing(flask_login.current_user).force_reindex(uuid)
+        return list(to_return)
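
A usage sketch of the new endpoint with the requests library; the base URL is an assumption for a local development instance, adjust it to your deployment:

    import requests

    base_url = 'http://127.0.0.1:5100'  # assumption: local dev instance

    # No parameter: list of indexed TLDs
    print(requests.get(f'{base_url}/json/tlds').json())

    # Captures with hits on .com, as [capture_uuid, timestamp] pairs
    print(requests.get(f'{base_url}/json/tlds', params={'tld': 'com'}).json())

    # Recent URLs with that TLD, regardless of the capture
    print(requests.get(f'{base_url}/json/tlds', params={'tld': 'com', 'urls_only': 1}).json())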

View File

@@ -6,7 +6,7 @@ import hashlib
 import json
 import os
 import re
-from functools import lru_cache, cache
+from functools import lru_cache
 from pathlib import Path

 import flask_login  # type: ignore[import-untyped]
@@ -14,6 +14,7 @@ from flask import Request
 from werkzeug.security import generate_password_hash

 from lookyloo import Lookyloo, Indexing
+from lookyloo.helpers import get_indexing as get_indexing_cache
 from lookyloo.default import get_config, get_homedir, LookylooException

 __global_lookyloo_instance = None
@@ -118,18 +119,10 @@ def sri_load() -> dict[str, dict[str, str]]:
         return json.load(f)


-@cache
 def get_indexing(user: User | None) -> Indexing:
     '''Depending if we're logged in or not, we (can) get different indexes:
     if index_everything is enabled, we have an index in kvrocks that contains all
     the indexes for all the captures.
     It is only accessible to the admin user.
     '''
-    if not get_config('generic', 'index_everything'):
-        return Indexing()
-    if not user or not user.is_authenticated:
-        # No user or anonymous
-        return Indexing()
-    # Logged in user
-    return Indexing(full_index=True)
+    return get_indexing_cache(full=bool(user and user.is_authenticated))