From b3a4f539b03f254258ba6ce9cb9b5bc374bd75d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Tue, 24 Sep 2024 15:40:40 +0200 Subject: [PATCH] new: TLDs indexing, new format for capture internals indexes --- lookyloo/capturecache.py | 9 +++-- lookyloo/helpers.py | 4 +- lookyloo/indexing.py | 81 +++++++++++++++++++++++++++++++++++++-- lookyloo/lookyloo.py | 10 +++++ website/web/genericapi.py | 40 ++++++++++++++++++- website/web/helpers.py | 13 ++----- 6 files changed, 135 insertions(+), 22 deletions(-) diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py index 9ff76946..5f0179cc 100644 --- a/lookyloo/capturecache.py +++ b/lookyloo/capturecache.py @@ -28,8 +28,7 @@ from pyipasnhistory import IPASNHistory # type: ignore[attr-defined] from redis import Redis from .context import Context -from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree -from .indexing import Indexing +from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree, get_indexing from .default import LookylooException, try_make_file, get_config from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild from .modules import Cloudflare @@ -119,7 +118,6 @@ class CapturesIndex(Mapping): # type: ignore[type-arg] self.logger = logging.getLogger(f'{self.__class__.__name__}') self.logger.setLevel(get_config('generic', 'loglevel')) self.redis = redis - self.indexing = Indexing() self.contextualizer = contextualizer self.__cache_max_size = maxsize self.__cache: dict[str, CaptureCache] = OrderedDict() @@ -363,7 +361,10 @@ class CapturesIndex(Mapping): # type: ignore[type-arg] try: logger.debug('The tree needs to be rebuilt.') tree = self._create_pickle(capture_dir, logger) - self.indexing.force_reindex(uuid) + # Force the reindexing in the public and full index (if enabled) + get_indexing().force_reindex(uuid) + if get_config('generic', 'index_everything'): + get_indexing(full=True).force_reindex(uuid) except NoValidHarFile: logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.') tree = None diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py index ed0a5c91..bae21219 100644 --- a/lookyloo/helpers.py +++ b/lookyloo/helpers.py @@ -446,9 +446,7 @@ def load_user_config(username: str) -> dict[str, Any] | None: @cache def get_indexing(full: bool=False) -> Indexing: from .indexing import Indexing - if not get_config('generic', 'index_everything'): - return Indexing() - if full: + if get_config('generic', 'index_everything') and full: return Indexing(full_index=True) return Indexing() diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py index 2238acfd..93979c94 100644 --- a/lookyloo/indexing.py +++ b/lookyloo/indexing.py @@ -5,9 +5,10 @@ from __future__ import annotations import base64 import hashlib import logging -# import re + from io import BytesIO from collections import defaultdict +from datetime import datetime, timedelta from urllib.parse import urlsplit from zipfile import ZipFile @@ -76,13 +77,22 @@ class Indexing(): p.srem('indexed_favicons', capture_uuid) p.srem('indexed_identifiers', capture_uuid) p.srem('indexed_categories', capture_uuid) + p.srem('indexed_tlds', capture_uuid) for identifier_type in self.identifiers_types(): p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid) for hash_type in self.captures_hashes_types(): p.srem(f'indexed_hash_type|{hash_type}', capture_uuid) + for internal_index in 
self.redis.smembers(f'capture_indexes|{capture_uuid}'):
+            # internal_index can be "tlds"
+            for entry in self.redis.smembers(f'capture_indexes|{capture_uuid}|{internal_index}'):
+                # entry can be e.g. "com"; delete the set of node UUIDs and remove the capture from the captures set
+                p.delete(f'capture_indexes|{capture_uuid}|{internal_index}|{entry}')
+                p.zrem(f'{internal_index}|{entry}|captures', capture_uuid)
+            p.delete(f'capture_indexes|{capture_uuid}|{internal_index}')
+        p.delete(f'capture_indexes|{capture_uuid}')
         p.execute()
 
-    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool]:
+    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool]:
         p = self.redis.pipeline()
         p.sismember('indexed_urls', capture_uuid)
         p.sismember('indexed_body_hashes', capture_uuid)
@@ -91,11 +101,12 @@ class Indexing():
         p.sismember('indexed_favicons', capture_uuid)
         p.sismember('indexed_identifiers', capture_uuid)
         p.sismember('indexed_categories', capture_uuid)
+        p.sismember('indexed_tlds', capture_uuid)
         # We also need to check if the hash_type are all indexed for this capture
         hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
         to_return: list[bool] = p.execute()
         to_return.append(hash_types_indexed)
-        # This call for sure returns a tuple of 7 booleans
+        # This call for sure returns a tuple of 9 booleans
         return tuple(to_return)  # type: ignore[return-value]
 
     def index_capture(self, uuid_to_index: str, directory: Path) -> None:
@@ -145,6 +156,9 @@ class Indexing():
             self.logger.info(f'Indexing categories for {uuid_to_index}')
             self.index_categories_capture(uuid_to_index, directory)
         if not indexed[7]:
+            self.logger.info(f'Indexing TLDs for {uuid_to_index}')
+            self.index_tld_capture(ct)
+        if not indexed[8]:
             self.logger.info(f'Indexing hash types for {uuid_to_index}')
             self.index_capture_hashes_types(ct)
 
@@ -345,7 +359,7 @@ class Indexing():
             if 'hhhash' not in urlnode.features:
                 continue
             if urlnode.hhhash in already_loaded:
-                # Only add cookie name once / capture
+                # Only add HTTP header Hash once / capture
                 continue
             already_loaded.add(urlnode.hhhash)
             if urlnode.hhhash not in already_cleaned_up:
@@ -401,6 +415,65 @@ class Indexing():
     def get_captures_hostname(self, hostname: str) -> set[str]:
         return self.redis.smembers(f'hostnames|{hostname}|captures')
 
+    # ###### TLDs ######
+
+    @property
+    def tlds(self) -> set[str]:
+        return self.redis.smembers('tlds')
+
+    def index_tld_capture(self, crawled_tree: CrawledTree) -> None:
+        if self.redis.sismember('indexed_tlds', crawled_tree.uuid):
+            # Do not reindex
+            return
+        self.redis.sadd('indexed_tlds', crawled_tree.uuid)
+        self.logger.debug(f'Indexing TLDs for {crawled_tree.uuid} ... ')
+        pipeline = self.redis.pipeline()
+
+        # Add the tlds key in the internal indexes set
+        internal_index = f'capture_indexes|{crawled_tree.uuid}'
+        pipeline.sadd(internal_index, 'tlds')
+
+        already_indexed_global: set[str] = set()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if not hasattr(urlnode, 'known_tld'):
+                # No TLD in the node.
+                continue
+            if urlnode.known_tld not in already_indexed_global:
+                # TLD hasn't been indexed in that run yet
+                already_indexed_global.add(urlnode.known_tld)
+                pipeline.sadd(f'{internal_index}|tlds', urlnode.known_tld)  # Only used to delete index
+                pipeline.sadd('tlds', urlnode.known_tld)
+                pipeline.zadd(f'tlds|{urlnode.known_tld}|captures',
+                              mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
+
+            # Add the URL node UUID in the internal index
+            pipeline.sadd(f'{internal_index}|tlds|{urlnode.known_tld}', urlnode.uuid)
+
+        pipeline.execute()
+        self.logger.debug(f'done with TLDs for {crawled_tree.uuid}.')
+
+    def get_captures_tld(self, tld: str, most_recent_capture: datetime | None = None,
+                         oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
+        """Get all the captures for a specific TLD, over a time interval starting from the most recent one.
+
+        :param tld: The TLD
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 5 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=5)).timestamp()
+        return self.redis.zrevrangebyscore(f'tlds|{tld}|captures', max_score, min_score, withscores=True)
+
+    def get_capture_tld_counter(self, capture_uuid: str, tld: str) -> int:
+        # NOTE: what to do when the capture isn't indexed yet? Raise an exception?
+        # For now, return 0
+        return self.redis.scard(f'capture_indexes|{capture_uuid}|tlds|{tld}')
+
+    def get_capture_tld_nodes(self, capture_uuid: str, tld: str) -> set[str]:
+        if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|tlds|{tld}'):
+            return set(url_nodes)
+        return set()
+
     # ###### favicons ######
 
     @property
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 78461dcd..5f8d7db3 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -223,11 +223,21 @@ class Lookyloo():
         ct = self.get_crawled_tree(capture_uuid)
         return ct.root_hartree.get_url_node_by_uuid(node_uuid)
 
+    def get_urlnodes_from_tree(self, capture_uuid: str, /, node_uuids: Iterable[str]) -> list[URLNode]:
+        '''Get a list of URL nodes from a tree, by UUID'''
+        ct = self.get_crawled_tree(capture_uuid)
+        return [ct.root_hartree.get_url_node_by_uuid(node_uuid) for node_uuid in node_uuids]
+
     def get_hostnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> HostNode:
         '''Get a host node from a tree, by UUID'''
         ct = self.get_crawled_tree(capture_uuid)
         return ct.root_hartree.get_host_node_by_uuid(node_uuid)
 
+    def get_hostnodes_from_tree(self, capture_uuid: str, /, node_uuids: Iterable[str]) -> list[HostNode]:
+        '''Get a list of host nodes from a tree, by UUID'''
+        ct = self.get_crawled_tree(capture_uuid)
+        return [ct.root_hartree.get_host_node_by_uuid(node_uuid) for node_uuid in node_uuids]
+
     def get_statistics(self, capture_uuid: str, /) -> dict[str, Any]:
         '''Get the statistics of a capture.'''
         ct = self.get_crawled_tree(capture_uuid)
diff --git a/website/web/genericapi.py b/website/web/genericapi.py
index 7d8a5e31..a565d6c4 100644
--- a/website/web/genericapi.py
+++ b/website/web/genericapi.py
@@ -6,6 +6,8 @@ import base64
 import gzip
 import hashlib
 import json
+import logging
+import logging.config
 
 from io import BytesIO
 from typing import Any
@@ -20,6 +22,7 @@ from werkzeug.security import check_password_hash
 from lacuscore import CaptureStatus as CaptureStatusCore, CaptureSettingsError
 from pylacus import CaptureStatus as CaptureStatusPy
 from lookyloo import CaptureSettings, Lookyloo
+from lookyloo.default import get_config
 from lookyloo.comparator import Comparator
 from lookyloo.exceptions import MissingUUID, NoValidHarFile
 from lookyloo.helpers import load_user_config
@@ -31,6 +34,7 @@ api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')
 
 lookyloo: Lookyloo = get_lookyloo_instance()
 comparator: Comparator = Comparator()
+logging.config.dictConfig(get_config('logging'))
 
 
 def api_auth_check(method):  # type: ignore[no-untyped-def]
@@ -784,7 +788,7 @@ class CaptureHide(Resource):  # type: ignore[misc]
         except Exception as e:
             return {'error': f'Unable to hide the tree: {e}'}, 400
         return {'info': f'Capture {capture_uuid} successfully hidden.'}
-
+
 
 @api.route('/admin/<string:capture_uuid>/remove')
 @api.doc(description='Remove the capture from the index.',
@@ -825,3 +829,37 @@ class CategoriesCaptures(Resource):  # type: ignore[misc]
             return list(get_indexing(flask_login.current_user).get_captures_category(category))
         return {c: list(get_indexing(flask_login.current_user).get_captures_category(c))
                 for c in existing_categories}
+
+
+# NOTE: there are a few extra parameters we may want to add in the future: most recent/oldest capture
+@api.route('/json/tlds')
+@api.doc(description='Get captures with hits on a specific TLD; if no TLD is given, returns the list of indexed TLDs.')
+class TLDCaptures(Resource):  # type: ignore[misc]
+
+    @api.param('tld', 'Get captures with a specific TLD and their capture timestamp.')  # type: ignore[misc]
+    @api.param('urls_only', 'Returns recent URLs with that TLD, regardless of the capture.')  # type: ignore[misc]
+    def get(self) -> list[tuple[str, float]] | list[str]:
+        tld: str | None = request.args['tld'] if request.args.get('tld') else None
+        urls_only: bool | None = True if request.args.get('urls_only') else None
+        if not tld:
+            return list(get_indexing(flask_login.current_user).tlds)
+        recent_captures_with_tld = get_indexing(flask_login.current_user).get_captures_tld(tld)
+        if not recent_captures_with_tld:
+            return []
+        if not urls_only:
+            return recent_captures_with_tld
+        # get the capture, get the node uuids, get the names, make it a list
+        to_return: set[str] = set()
+        # Make sure to only get the captures with a pickle ready
+        cache = lookyloo.sorted_capture_cache([uuid for uuid, _ in recent_captures_with_tld], cached_captures_only=True)
+        for c in cache:
+            uuid = c.uuid
+            nodes_with_tld = get_indexing(flask_login.current_user).get_capture_tld_nodes(uuid, tld)
+            try:
+                to_return.update(node.name for node in lookyloo.get_urlnodes_from_tree(uuid, nodes_with_tld))
+            except IndexError:
+                # The capture needs to be re-indexed
+                # NOTE: If this warning is printed in a loop for a capture, we have a problem with the index.
+                logging.warning(f'Capture {uuid} needs to be re-indexed.')
+                get_indexing(flask_login.current_user).force_reindex(uuid)
+        return list(to_return)
diff --git a/website/web/helpers.py b/website/web/helpers.py
index ba3841f2..7e0c6f24 100644
--- a/website/web/helpers.py
+++ b/website/web/helpers.py
@@ -6,7 +6,7 @@ import hashlib
 import json
 import os
 import re
-from functools import lru_cache, cache
+from functools import lru_cache
 from pathlib import Path
 
 import flask_login  # type: ignore[import-untyped]
@@ -14,6 +14,7 @@ from flask import Request
 from werkzeug.security import generate_password_hash
 
 from lookyloo import Lookyloo, Indexing
+from lookyloo.helpers import get_indexing as get_indexing_cache
 from lookyloo.default import get_config, get_homedir, LookylooException
 
 __global_lookyloo_instance = None
@@ -118,18 +119,10 @@ def sri_load() -> dict[str, dict[str, str]]:
         return json.load(f)
 
 
-@cache
 def get_indexing(user: User | None) -> Indexing:
     '''Depending if we're logged in or not, we (can) get different indexes:
     if index_everything is enabled, we have an index in kvrocks that contains all
     the indexes for all the captures.
     It is only accessible to the admin user.
     '''
-    if not get_config('generic', 'index_everything'):
-        return Indexing()
-
-    if not user or not user.is_authenticated:
-        # No user or anonymous
-        return Indexing()
-    # Logged in user
-    return Indexing(full_index=True)
+    return get_indexing_cache(full=bool(user and user.is_authenticated))
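
Reviewer note (not part of the patch): below is a minimal sketch of how the new
/json/tlds endpoint added in website/web/genericapi.py could be exercised once this
change is deployed. The instance URL and port, the 'requests' dependency and the
example TLD ('com') are assumptions for illustration, not something this patch defines.

    #!/usr/bin/env python3

    import requests

    # Assumption: a Lookyloo instance reachable locally; adjust to your deployment.
    instance = 'http://127.0.0.1:5100'

    # Without a 'tld' parameter, the endpoint returns the list of indexed TLDs.
    print(requests.get(f'{instance}/json/tlds').json())

    # With a 'tld' parameter, it returns [capture UUID, timestamp] pairs for the
    # captures that hit that TLD (by default over the last 5 days, see
    # Indexing.get_captures_tld).
    print(requests.get(f'{instance}/json/tlds', params={'tld': 'com'}).json())

    # With 'urls_only' set, it returns the URLs seen on that TLD in those captures.
    print(requests.get(f'{instance}/json/tlds',
                       params={'tld': 'com', 'urls_only': 1}).json())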