From b3a4f539b03f254258ba6ce9cb9b5bc374bd75d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Tue, 24 Sep 2024 15:40:40 +0200 Subject: [PATCH] new: TLDs indexing, new format for capture internals indexes --- lookyloo/capturecache.py | 9 +++-- lookyloo/helpers.py | 4 +- lookyloo/indexing.py | 81 +++++++++++++++++++++++++++++++++++++-- lookyloo/lookyloo.py | 10 +++++ website/web/genericapi.py | 40 ++++++++++++++++++- website/web/helpers.py | 13 ++----- 6 files changed, 135 insertions(+), 22 deletions(-) diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py index 9ff76946..5f0179cc 100644 --- a/lookyloo/capturecache.py +++ b/lookyloo/capturecache.py @@ -28,8 +28,7 @@ from pyipasnhistory import IPASNHistory # type: ignore[attr-defined] from redis import Redis from .context import Context -from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree -from .indexing import Indexing +from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree, get_indexing from .default import LookylooException, try_make_file, get_config from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild from .modules import Cloudflare @@ -119,7 +118,6 @@ class CapturesIndex(Mapping): # type: ignore[type-arg] self.logger = logging.getLogger(f'{self.__class__.__name__}') self.logger.setLevel(get_config('generic', 'loglevel')) self.redis = redis - self.indexing = Indexing() self.contextualizer = contextualizer self.__cache_max_size = maxsize self.__cache: dict[str, CaptureCache] = OrderedDict() @@ -363,7 +361,10 @@ class CapturesIndex(Mapping): # type: ignore[type-arg] try: logger.debug('The tree needs to be rebuilt.') tree = self._create_pickle(capture_dir, logger) - self.indexing.force_reindex(uuid) + # Force the reindexing in the public and full index (if enabled) + get_indexing().force_reindex(uuid) + if get_config('generic', 'index_everything'): + get_indexing(full=True).force_reindex(uuid) except NoValidHarFile: logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.') tree = None diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py index ed0a5c91..bae21219 100644 --- a/lookyloo/helpers.py +++ b/lookyloo/helpers.py @@ -446,9 +446,7 @@ def load_user_config(username: str) -> dict[str, Any] | None: @cache def get_indexing(full: bool=False) -> Indexing: from .indexing import Indexing - if not get_config('generic', 'index_everything'): - return Indexing() - if full: + if get_config('generic', 'index_everything') and full: return Indexing(full_index=True) return Indexing() diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py index 2238acfd..93979c94 100644 --- a/lookyloo/indexing.py +++ b/lookyloo/indexing.py @@ -5,9 +5,10 @@ from __future__ import annotations import base64 import hashlib import logging -# import re + from io import BytesIO from collections import defaultdict +from datetime import datetime, timedelta from urllib.parse import urlsplit from zipfile import ZipFile @@ -76,13 +77,22 @@ class Indexing(): p.srem('indexed_favicons', capture_uuid) p.srem('indexed_identifiers', capture_uuid) p.srem('indexed_categories', capture_uuid) + p.srem('indexed_tlds', capture_uuid) for identifier_type in self.identifiers_types(): p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid) for hash_type in self.captures_hashes_types(): p.srem(f'indexed_hash_type|{hash_type}', capture_uuid) + for internal_index in 
self.redis.smembers(f'capture_indexes|{capture_uuid}'):
+            # internal_index can be "tlds"
+            for entry in self.redis.smembers(f'capture_indexes|{capture_uuid}|{internal_index}'):
+                # entry can be e.g. "com"; delete the set of node UUIDs and remove the capture from the captures set
+                p.delete(f'capture_indexes|{capture_uuid}|{internal_index}|{entry}')
+                p.zrem(f'{internal_index}|{entry}|captures', capture_uuid)
+            p.delete(f'capture_indexes|{capture_uuid}|{internal_index}')
+        p.delete(f'capture_indexes|{capture_uuid}')
         p.execute()
 
-    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool]:
+    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool]:
         p = self.redis.pipeline()
         p.sismember('indexed_urls', capture_uuid)
         p.sismember('indexed_body_hashes', capture_uuid)
@@ -91,11 +101,12 @@ class Indexing():
         p.sismember('indexed_favicons', capture_uuid)
         p.sismember('indexed_identifiers', capture_uuid)
         p.sismember('indexed_categories', capture_uuid)
+        p.sismember('indexed_tlds', capture_uuid)
         # We also need to check if the hash_type are all indexed for this capture
         hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
         to_return: list[bool] = p.execute()
         to_return.append(hash_types_indexed)
-        # This call for sure returns a tuple of 7 booleans
+        # This call for sure returns a tuple of 9 booleans
         return tuple(to_return)  # type: ignore[return-value]
 
     def index_capture(self, uuid_to_index: str, directory: Path) -> None:
@@ -145,6 +156,9 @@ class Indexing():
             self.logger.info(f'Indexing categories for {uuid_to_index}')
             self.index_categories_capture(uuid_to_index, directory)
         if not indexed[7]:
+            self.logger.info(f'Indexing TLDs for {uuid_to_index}')
+            self.index_tld_capture(ct)
+        if not indexed[8]:
             self.logger.info(f'Indexing hash types for {uuid_to_index}')
             self.index_capture_hashes_types(ct)
 
@@ -345,7 +359,7 @@ class Indexing():
             if 'hhhash' not in urlnode.features:
                 continue
             if urlnode.hhhash in already_loaded:
-                # Only add cookie name once / capture
+                # Only add HTTP header Hash once / capture
                 continue
             already_loaded.add(urlnode.hhhash)
             if urlnode.hhhash not in already_cleaned_up:
@@ -401,6 +415,65 @@ class Indexing():
     def get_captures_hostname(self, hostname: str) -> set[str]:
         return self.redis.smembers(f'hostnames|{hostname}|captures')
 
+    # ###### TLDs ######
+
+    @property
+    def tlds(self) -> set[str]:
+        return self.redis.smembers('tlds')
+
+    def index_tld_capture(self, crawled_tree: CrawledTree) -> None:
+        if self.redis.sismember('indexed_tlds', crawled_tree.uuid):
+            # Do not reindex
+            return
+        self.redis.sadd('indexed_tlds', crawled_tree.uuid)
+        self.logger.debug(f'Indexing TLDs for {crawled_tree.uuid} ... ')
+        pipeline = self.redis.pipeline()
+
+        # Add the tlds key in the internal indexes set
+        internal_index = f'capture_indexes|{crawled_tree.uuid}'
+        pipeline.sadd(internal_index, 'tlds')
+
+        already_indexed_global: set[str] = set()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if not hasattr(urlnode, 'known_tld'):
+                # No TLD in the node.
+                continue
+            if urlnode.known_tld not in already_indexed_global:
+                # TLD hasn't been indexed in that run yet
+                already_indexed_global.add(urlnode.known_tld)
+                pipeline.sadd(f'{internal_index}|tlds', urlnode.known_tld)  # Only used to delete index
+                pipeline.sadd('tlds', urlnode.known_tld)
+                pipeline.zadd(f'tlds|{urlnode.known_tld}|captures',
+                              mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
+
+            # Add the URL node UUID in the internal index
+            pipeline.sadd(f'{internal_index}|tlds|{urlnode.known_tld}', urlnode.uuid)
+
+        pipeline.execute()
+        self.logger.debug(f'done with TLDs for {crawled_tree.uuid}.')
+
+    def get_captures_tld(self, tld: str, most_recent_capture: datetime | None = None,
+                         oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
+        """Get all the captures for a specific TLD, over a time interval starting from the most recent one.
+
+        :param tld: The TLD
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 5 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=5)).timestamp()
+        return self.redis.zrevrangebyscore(f'tlds|{tld}|captures', max_score, min_score, withscores=True)
+
+    def get_capture_tld_counter(self, capture_uuid: str, tld: str) -> int:
+        # NOTE: what to do when the capture isn't indexed yet? Raise an exception?
+        # For now, return 0
+        return self.redis.scard(f'capture_indexes|{capture_uuid}|tlds|{tld}')
+
+    def get_capture_tld_nodes(self, capture_uuid: str, tld: str) -> set[str]:
+        if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|tlds|{tld}'):
+            return set(url_nodes)
+        return set()
+
     # ###### favicons ######
 
     @property
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 78461dcd..5f8d7db3 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -223,11 +223,21 @@ class Lookyloo():
         ct = self.get_crawled_tree(capture_uuid)
         return ct.root_hartree.get_url_node_by_uuid(node_uuid)
 
+    def get_urlnodes_from_tree(self, capture_uuid: str, /, node_uuids: Iterable[str]) -> list[URLNode]:
+        '''Get a list of URL nodes from a tree, by UUID'''
+        ct = self.get_crawled_tree(capture_uuid)
+        return [ct.root_hartree.get_url_node_by_uuid(node_uuid) for node_uuid in node_uuids]
+
     def get_hostnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> HostNode:
         '''Get a host node from a tree, by UUID'''
         ct = self.get_crawled_tree(capture_uuid)
         return ct.root_hartree.get_host_node_by_uuid(node_uuid)
 
+    def get_hostnodes_from_tree(self, capture_uuid: str, /, node_uuids: Iterable[str]) -> list[HostNode]:
+        '''Get a list of host nodes from a tree, by UUID'''
+        ct = self.get_crawled_tree(capture_uuid)
+        return [ct.root_hartree.get_host_node_by_uuid(node_uuid) for node_uuid in node_uuids]
+
     def get_statistics(self, capture_uuid: str, /) -> dict[str, Any]:
         '''Get the statistics of a capture.'''
         ct = self.get_crawled_tree(capture_uuid)
diff --git a/website/web/genericapi.py b/website/web/genericapi.py
index 7d8a5e31..a565d6c4 100644
--- a/website/web/genericapi.py
+++ b/website/web/genericapi.py
@@ -6,6 +6,8 @@ import base64
 import gzip
 import hashlib
 import json
+import logging
+import logging.config
 
 from io import BytesIO
 from typing import Any
@@ -20,6 +22,7 @@ from werkzeug.security import check_password_hash
 from lacuscore import CaptureStatus as CaptureStatusCore, CaptureSettingsError
 from pylacus import CaptureStatus as CaptureStatusPy
 from lookyloo import CaptureSettings, Lookyloo
+from lookyloo.default import get_config
 from lookyloo.comparator import Comparator
 from lookyloo.exceptions import MissingUUID, NoValidHarFile
 from lookyloo.helpers import load_user_config
@@ -31,6 +34,7 @@ api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')
 
 lookyloo: Lookyloo = get_lookyloo_instance()
 comparator: Comparator = Comparator()
+logging.config.dictConfig(get_config('logging'))
 
 
 def api_auth_check(method):  # type: ignore[no-untyped-def]
@@ -784,7 +788,7 @@ class CaptureHide(Resource):  # type: ignore[misc]
         except Exception as e:
             return {'error': f'Unable to hide the tree: {e}'}, 400
         return {'info': f'Capture {capture_uuid} successfully hidden.'}
-
+
 
 @api.route('/admin/<string:capture_uuid>/remove')
 @api.doc(description='Remove the capture from the index.',
@@ -825,3 +829,37 @@ class CategoriesCaptures(Resource):  # type: ignore[misc]
             return list(get_indexing(flask_login.current_user).get_captures_category(category))
         return {c: list(get_indexing(flask_login.current_user).get_captures_category(c))
                 for c in existing_categories}
+
+
+# NOTE: there are a few extra parameters we may want to add in the future: most recent/oldest capture
+@api.route('/json/tlds')
+@api.doc(description='Get captures with hits on a specific TLD; if no TLD is given, returns the list of indexed TLDs.')
+class TLDCaptures(Resource):  # type: ignore[misc]
+
+    @api.param('tld', 'Get captures with a specific TLD and their capture timestamp.')  # type: ignore[misc]
+    @api.param('urls_only', 'Returns recent URLs with that TLD, regardless of the capture.')  # type: ignore[misc]
+    def get(self) -> list[tuple[str, float]] | list[str]:
+        tld: str | None = request.args['tld'] if request.args.get('tld') else None
+        urls_only: bool | None = True if request.args.get('urls_only') else None
+        if not tld:
+            return list(get_indexing(flask_login.current_user).tlds)
+        recent_captures_with_tld = get_indexing(flask_login.current_user).get_captures_tld(tld)
+        if not recent_captures_with_tld:
+            return []
+        if not urls_only:
+            return recent_captures_with_tld
+        # get the capture, get the node uuids, get the names, make it a list
+        to_return: set[str] = set()
+        # Make sure to only get the captures with a pickle ready
+        cache = lookyloo.sorted_capture_cache([uuid for uuid, _ in recent_captures_with_tld], cached_captures_only=True)
+        for c in cache:
+            uuid = c.uuid
+            nodes_with_tld = get_indexing(flask_login.current_user).get_capture_tld_nodes(uuid, tld)
+            try:
+                to_return.update(node.name for node in lookyloo.get_urlnodes_from_tree(uuid, nodes_with_tld))
+            except IndexError:
+                # The capture needs to be re-indexed
+                # NOTE: If this warning is printed in a loop for a capture, we have a problem with the index.
+                logging.warning(f'Capture {uuid} needs to be re-indexed.')
+                get_indexing(flask_login.current_user).force_reindex(uuid)
+        return list(to_return)
diff --git a/website/web/helpers.py b/website/web/helpers.py
index ba3841f2..7e0c6f24 100644
--- a/website/web/helpers.py
+++ b/website/web/helpers.py
@@ -6,7 +6,7 @@ import hashlib
 import json
 import os
 import re
-from functools import lru_cache, cache
+from functools import lru_cache
 from pathlib import Path
 
 import flask_login  # type: ignore[import-untyped]
@@ -14,6 +14,7 @@ from flask import Request
 from werkzeug.security import generate_password_hash
 
 from lookyloo import Lookyloo, Indexing
+from lookyloo.helpers import get_indexing as get_indexing_cache
 from lookyloo.default import get_config, get_homedir, LookylooException
 
 __global_lookyloo_instance = None
@@ -118,18 +119,10 @@ def sri_load() -> dict[str, dict[str, str]]:
         return json.load(f)
 
 
-@cache
 def get_indexing(user: User | None) -> Indexing:
     '''Depending if we're logged in or not, we (can) get different indexes:
     if index_everything is enabled, we have an index in kvrocks that contains all
     the indexes for all the captures.
     It is only accessible to the admin user.
     '''
-    if not get_config('generic', 'index_everything'):
-        return Indexing()
-
-    if not user or not user.is_authenticated:
-        # No user or anonymous
-        return Indexing()
-    # Logged in user
-    return Indexing(full_index=True)
+    return get_indexing_cache(full=bool(user and user.is_authenticated))
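
Reviewer note (not part of the patch): below is a minimal sketch of how the new
/json/tlds endpoint added in website/web/genericapi.py could be exercised once this
change is deployed. The instance URL and port, the 'requests' dependency and the
example TLD ('com') are assumptions for illustration, not something this patch defines.

    #!/usr/bin/env python3

    import requests

    # Assumption: a Lookyloo instance reachable locally; adjust to your deployment.
    instance = 'http://127.0.0.1:5100'

    # Without a 'tld' parameter, the endpoint returns the list of indexed TLDs.
    print(requests.get(f'{instance}/json/tlds').json())

    # With a 'tld' parameter, it returns [capture UUID, timestamp] pairs for the
    # captures that hit that TLD (by default over the last 5 days, see
    # Indexing.get_captures_tld).
    print(requests.get(f'{instance}/json/tlds', params={'tld': 'com'}).json())

    # With 'urls_only' set, it returns the URLs seen on that TLD in those captures.
    print(requests.get(f'{instance}/json/tlds',
                       params={'tld': 'com', 'urls_only': 1}).json())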