From ba05dc186d36d42b8566e0bc11ea9dc6a47a3977 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Mon, 4 Nov 2024 20:12:30 +0100
Subject: [PATCH] chg: Migrate categories to paginated index

---
 lookyloo/indexing.py      | 90 ++++++++++++++++++++++++++++-----------
 lookyloo/lookyloo.py      | 10 -----
 website/web/__init__.py   |  2 +-
 website/web/genericapi.py | 11 ++---
 4 files changed, 71 insertions(+), 42 deletions(-)

diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index 7c9f8968..cc2a5a91 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import hashlib
 import logging
+import re
 
 from datetime import datetime, timedelta
 
@@ -153,7 +154,7 @@ class Indexing():
                 self.index_identifiers_capture(ct)
             if not indexed[6]:
                 self.logger.info(f'Indexing categories for {uuid_to_index}')
-                self.index_categories_capture(uuid_to_index, directory)
+                self.index_categories_capture(ct, directory)
             if not indexed[7]:
                 self.logger.info(f'Indexing TLDs for {uuid_to_index}')
                 self.index_tld_capture(ct)
@@ -866,44 +867,85 @@ class Indexing():
 
     # ###### Categories ######
 
+    def _reindex_categories(self, category: str) -> None:
+        # the old format was adding the capture without a prefix, so we can use that to remove the old indexes
+        # the hardcoded categories only contained lowercase ascii and "-", ignore any other key
+        if not re.match(r'^[a-z-]+$', category):
+            return
+        if not self.redis.exists(category):
+            return
+        if self.redis.type(category) != 'set':  # type: ignore[no-untyped-call]
+            return
+        captures_to_reindex = self.redis.smembers(category)
+        pipeline = self.redis.pipeline()
+        pipeline.srem('indexed_categories', *captures_to_reindex)
+        pipeline.delete(category)
+        pipeline.execute()
+
     @property
     def categories(self) -> set[str]:
         return self.redis.smembers('categories')
 
-    def index_categories_capture(self, capture_uuid: str, capture_dir: Path) -> None:
-        if self.redis.sismember('indexed_categories', capture_uuid):
+    def index_categories_capture(self, crawled_tree: CrawledTree, capture_dir: Path) -> None:
+        if self.redis.sismember('indexed_categories', crawled_tree.uuid):
             # do not reindex
             return
-        # Make sure we don't reindex
-        self.redis.sadd('indexed_categories', capture_uuid)
+        self.redis.sadd('indexed_categories', crawled_tree.uuid)
+        self.logger.debug(f'Indexing captures for {crawled_tree.uuid} ... ')
+
+        internal_index = f'capture_indexes|{crawled_tree.uuid}'
+        check_if_exists = set()
+        # Remove all the old categories if any
+        pipeline = self.redis.pipeline()
+        for old_category in self.redis.smembers(f'{internal_index}|categories'):
+            self._reindex_categories(old_category)
+            pipeline.zrem(f'categories|{old_category}|captures', crawled_tree.uuid)
+            # after we run the pipeline, we can check if f'categories|{old_category}|captures' exists
+            # and remove old_category from the existing categories
+            check_if_exists.add(old_category)
+        pipeline.delete(f'{internal_index}|categories')
 
         categ_file = capture_dir / 'categories'
-        if categ_file.exists():
-            with categ_file.open('r') as f:
-                capture_categories = [c.strip() for c in f.readlines()]
-        else:
+        if not categ_file.exists():
+            pipeline.execute()
             return
 
-        added_in_existing_categories = set()
+        with categ_file.open('r') as f:
+            capture_categories = [c.strip() for c in f.readlines()]
+
+        for c in capture_categories:
+            pipeline.sadd('categories', c)
+            pipeline.sadd(f'{internal_index}|categories', c)
+            pipeline.zadd(f'categories|{c}|captures',
+                          mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
+
+        pipeline.execute()
         pipeline = self.redis.pipeline()
-        for c in self.categories:
-            if c in capture_categories:
-                pipeline.sadd(c, capture_uuid)
-                added_in_existing_categories.add(c)
-            else:
-                # the capture is not in that category, srem is as cheap as exists if not in the set
-                pipeline.srem(c, capture_uuid)
-        # Handle the new categories
-        for new_c in set(capture_categories) - added_in_existing_categories:
-            pipeline.sadd(new_c, capture_uuid)
-            pipeline.sadd('categories', new_c)
+        for c in check_if_exists:
+            if not self.redis.exists(f'categories|{c}|captures'):
+                pipeline.srem('categories', c)
         pipeline.execute()
 
-    def get_captures_category(self, category: str) -> set[str]:
-        return self.redis.smembers(category)
+    def get_captures_category(self, category: str, most_recent_capture: datetime | None=None,
+                              oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
+        """Get all the captures for a specific category, on a time interval starting from the most recent one.
+
+        :param category: The category
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 30 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=30)).timestamp()
+        return self.redis.zrevrangebyscore(f'categories|{category}|captures', max_score, min_score, withscores=True)
+
+    def get_capture_categories(self, capture_uuid: str) -> set[str]:
+        return self.redis.smembers(f'capture_indexes|{capture_uuid}|categories')
+
+    def get_captures_category_count(self, category: str) -> int:
+        return self.redis.zcard(f'categories|{category}|captures')
 
     def capture_in_category(self, capture_uuid: str, category: str) -> bool:
-        return self.redis.sismember(category, capture_uuid)
+        return self.redis.zscore(f'categories|{category}|captures', capture_uuid) is not None
 
     def reindex_categories_capture(self, capture_uuid: str) -> None:
         self.redis.srem('indexed_categories', capture_uuid)
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index c3601de2..d469f8fd 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -324,16 +324,6 @@ class Lookyloo():
 
         return None
 
-    def categories_capture(self, capture_uuid: str, /) -> list[str]:
-        '''Get all the categories related to a capture, in MISP Taxonomies format'''
-        categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
-        # get existing categories if possible
-        if categ_file.exists():
-            with categ_file.open() as f:
-                return [line.strip() for line in f.readlines()]
-        # return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
-        return []
-
     def categorize_capture(self, capture_uuid: str, /, category: str) -> None:
         '''Add a category (MISP Taxonomy tag) to a capture.'''
         if not get_config('generic', 'enable_categorization'):
diff --git a/website/web/__init__.py b/website/web/__init__.py
index d7f3decc..7d50fb51 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -707,7 +707,7 @@ def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | R
     entries = t.search(query)
     if entries:
         matching_categories = {e: t.revert_machinetag(e) for e in entries}
-    current_categories = lookyloo.categories_capture(tree_uuid)
+    current_categories = get_indexing(flask_login.current_user).get_capture_categories(tree_uuid)
     return render_template('categories_capture.html', tree_uuid=tree_uuid,
                            current_categories=current_categories,
                            matching_categories=matching_categories)
diff --git a/website/web/genericapi.py b/website/web/genericapi.py
index 0e36bbcb..054a601e 100644
--- a/website/web/genericapi.py
+++ b/website/web/genericapi.py
@@ -831,14 +831,11 @@ class RecentCaptures(Resource):  # type: ignore[misc]
          params={'category': 'The category according to which the uuids are to be returned.'},
          required=False)
 class CategoriesCaptures(Resource):  # type: ignore[misc]
-    def get(self, category: str | None=None) -> list[str] | dict[str, list[str]] | tuple[dict[str, str], int]:
-        existing_categories = get_indexing(flask_login.current_user).categories
+    def get(self, category: str | None=None) -> list[str] | dict[str, list[str]]:
         if category:
-            if category not in existing_categories:
-                return {'error': f'Invalid category: {category}, must be in {", ".join(existing_categories)}.'}, 400
-            return list(get_indexing(flask_login.current_user).get_captures_category(category))
-        return {c: list(get_indexing(flask_login.current_user).get_captures_category(c))
-                for c in existing_categories}
+            return [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_category(category)]
+        return {c: [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_category(c)]
+                for c in get_indexing(flask_login.current_user).categories}
         # NOTE: there are a few extra paramaters we may want to add in the future: most recent/oldest capture
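
Usage note (editor's sketch, not part of the commit): the patch replaces the per-category Redis sets with one sorted set per category, 'categories|<category>|captures', scored by capture start time, plus a per-capture set 'capture_indexes|<uuid>|categories'. The sketch below only illustrates how the new Indexing methods introduced above could be paged through; the default Indexing() constructor call and the 'phishing' category name are assumptions for illustration, not something this patch defines.

    # Requires a lookyloo install with its valkey/redis backend running.
    from datetime import datetime, timedelta

    from lookyloo.indexing import Indexing

    indexing = Indexing()   # assumption: default constructor, as used elsewhere in lookyloo
    category = 'phishing'   # hypothetical category name, for illustration only

    # Number of captures indexed for this category (ZCARD on the sorted set).
    total = indexing.get_captures_category_count(category)

    # First page: with no bounds, get_captures_category() returns the last
    # 30 days of captures, newest first, as (capture_uuid, timestamp) tuples.
    page = indexing.get_captures_category(category)

    # Next page: reuse the score of the oldest entry as the new upper bound and
    # go back another 30 days. The bounds are inclusive, so the boundary capture
    # comes back again and should be skipped by the caller.
    if page:
        _, boundary = page[-1]
        upper = datetime.fromtimestamp(boundary)
        older = indexing.get_captures_category(category,
                                               most_recent_capture=upper,
                                               oldest_capture=upper - timedelta(days=30))

    # Membership is now a ZSCORE lookup on the sorted set instead of SISMEMBER.
    assert all(indexing.capture_in_category(uuid, category) for uuid, _ in page)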