diff --git a/bin/background_indexer.py b/bin/background_indexer.py index 6cd09498..3a706816 100755 --- a/bin/background_indexer.py +++ b/bin/background_indexer.py @@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager): # Don't need the cache in this class. self.lookyloo.clear_tree_cache() - def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool], str], None, None]: + def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]: # NOTE: only get the non-archived captures for now. for uuid, directory in self.redis.hscan_iter('lookup_dirs'): if not self.full_indexer: @@ -89,10 +89,12 @@ class BackgroundIndexer(AbstractManager): self.logger.info(f'Indexing identifiers for {uuid_to_index}') self.indexing.index_identifiers_capture(ct) if not indexed[6]: + self.logger.info(f'Indexing categories for {uuid_to_index}') + categories = self.lookyloo.categories_capture(uuid_to_index) + self.indexing.index_categories_capture(uuid_to_index, categories) + if not indexed[7]: self.logger.info(f'Indexing hash types for {uuid_to_index}') self.indexing.index_capture_hashes_types(ct) - # NOTE: categories aren't taken in account here, should be fixed(?) - # see indexing.index_categories_capture(capture_uuid, categories) self.indexing.indexing_done() self.logger.info('... done.') diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py index f3cfcbad..7f257f0c 100644 --- a/lookyloo/capturecache.py +++ b/lookyloo/capturecache.py @@ -47,7 +47,7 @@ class LookylooCacheLogAdapter(LoggerAdapter): # type: ignore[type-arg] class CaptureCache(): __slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir', - 'error', 'no_index', 'categories', 'parent', + 'error', 'no_index', 'parent', 'user_agent', 'referer', 'logger') def __init__(self, cache_entry: dict[str, Any]): @@ -89,7 +89,6 @@ class CaptureCache(): # if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along self.error: str | None = cache_entry.get('error') self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False - self.categories: list[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else [] self.parent: str | None = cache_entry.get('parent') self.user_agent: str | None = cache_entry.get('user_agent') self.referer: str | None = cache_entry.get('referer') @@ -484,10 +483,6 @@ class CapturesIndex(Mapping): # type: ignore[type-arg] and "No har files in" not in cache['error']): logger.info(cache['error']) - if (capture_dir / 'categories').exists(): - with (capture_dir / 'categories').open() as _categories: - cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()]) - if (capture_dir / 'no_index').exists(): # If the folders claims anonymity cache['no_index'] = 1 diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py index 6ddcdc88..a81ad0bc 100644 --- a/lookyloo/indexing.py +++ b/lookyloo/indexing.py @@ -8,7 +8,6 @@ import logging # import re from io import BytesIO from collections import defaultdict -from typing import Iterable from urllib.parse import urlsplit from zipfile import ZipFile @@ -69,13 +68,14 @@ class Indexing(): p.srem('indexed_hhhashes', capture_uuid) p.srem('indexed_favicons', capture_uuid) p.srem('indexed_identifiers', capture_uuid) + p.srem('indexed_categories', capture_uuid) for identifier_type in self.identifiers_types(): p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid) for hash_type in self.captures_hashes_types(): p.srem(f'indexed_hash_type|{hash_type}', capture_uuid) p.execute() - def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool]: + def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool]: p = self.redis.pipeline() p.sismember('indexed_urls', capture_uuid) p.sismember('indexed_body_hashes', capture_uuid) @@ -83,6 +83,7 @@ class Indexing(): p.sismember('indexed_hhhashes', capture_uuid) p.sismember('indexed_favicons', capture_uuid) p.sismember('indexed_identifiers', capture_uuid) + p.sismember('indexed_categories', capture_uuid) # We also need to check if the hash_type are all indexed for this capture hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types()) to_return: list[bool] = p.execute() @@ -548,24 +549,34 @@ class Indexing(): # ###### Categories ###### @property - def categories(self) -> list[tuple[str, int]]: - return [(c, int(score)) - for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)] + def categories(self) -> set[str]: + return self.redis.smembers('categories') - def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]) -> None: - if not categories: - return + def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None: if self.redis.sismember('indexed_categories', capture_uuid): # do not reindex return self.redis.sadd('indexed_categories', capture_uuid) - if not categories: - return + added_in_existing_categories = set() pipeline = self.redis.pipeline() - for category in categories: - pipeline.zincrby('categories', 1, category) - pipeline.sadd(category, capture_uuid) + for c in self.categories: + if c in capture_categories: + pipeline.sadd(c, capture_uuid) + added_in_existing_categories.add(c) + else: + # the capture is not in that category, srem is as cheap as exists if not in the set + pipeline.srem(c, capture_uuid) + # Handle the new categories + for new_c in set(capture_categories) - added_in_existing_categories: + pipeline.sadd(new_c, capture_uuid) + pipeline.sadd('categories', new_c) pipeline.execute() def get_captures_category(self, category: str) -> set[str]: return self.redis.smembers(category) + + def capture_in_category(self, capture_uuid: str, category: str) -> bool: + return self.redis.sismember(category, capture_uuid) + + def reindex_categories_capture(self, capture_uuid: str) -> None: + self.redis.srem('indexed_categories', capture_uuid) diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 0e2efa6d..155c926d 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -308,16 +308,15 @@ class Lookyloo(): return None - def categories_capture(self, capture_uuid: str, /) -> dict[str, Any]: + def categories_capture(self, capture_uuid: str, /) -> list[str]: '''Get all the categories related to a capture, in MISP Taxonomies format''' categ_file = self._captures_index[capture_uuid].capture_dir / 'categories' # get existing categories if possible if categ_file.exists(): with categ_file.open() as f: - current_categories = [line.strip() for line in f.readlines()] + return [line.strip() for line in f.readlines()] # return {e: self.taxonomies.revert_machinetag(e) for e in current_categories} - return {e: e for e in current_categories} - return {} + return [] def categorize_capture(self, capture_uuid: str, /, category: str) -> None: '''Add a category (MISP Taxonomy tag) to a capture.''' diff --git a/website/web/__init__.py b/website/web/__init__.py index c991e3a0..21186dab 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -41,8 +41,10 @@ from werkzeug.wrappers.response import Response as WerkzeugResponse from lookyloo import Lookyloo, CaptureSettings from lookyloo.default import get_config from lookyloo.exceptions import MissingUUID, NoValidHarFile, LacusUnreachable -from lookyloo.helpers import (get_taxonomies, UserAgents, load_cookies, - load_user_config) +from lookyloo.helpers import (UserAgents, load_cookies, + load_user_config, + get_taxonomies + ) if sys.version_info < (3, 9): from pytz import all_timezones_set @@ -675,7 +677,7 @@ def historical_lookups(tree_uuid: str) -> str | WerkzeugResponse | Response: def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | Response: if not enable_categorization: return redirect(url_for('tree', tree_uuid=tree_uuid)) - matching_categories = None + matching_categories: dict[str, Any] = {} if 'verification-status' in request.form: status = request.form.get('verification-status') # fast categories @@ -692,6 +694,7 @@ def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | R categories.append(category) for category in categories: lookyloo.categorize_capture(tree_uuid, category) + get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid) if 'query' in request.form and request.form.get('query', '').strip(): matching_categories = {} t = get_taxonomies() @@ -711,6 +714,7 @@ def uncategorize_capture(tree_uuid: str, category: str) -> str | WerkzeugRespons if not enable_categorization: return jsonify({'response': 'Categorization not enabled.'}) lookyloo.uncategorize_capture(tree_uuid, category) + get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid) return jsonify({'response': f'{category} successfully removed from {tree_uuid}'}) @@ -721,6 +725,7 @@ def categorize_capture(tree_uuid: str, category: str) -> str | WerkzeugResponse if not enable_categorization: return jsonify({'response': 'Categorization not enabled.'}) lookyloo.categorize_capture(tree_uuid, category) + get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid) return jsonify({'response': f'{category} successfully added to {tree_uuid}'}) @@ -1327,9 +1332,8 @@ def index_generic(show_hidden: bool=False, show_error: bool=True, category: str if cut_time and cached.timestamp < cut_time_with_tz: continue - if category: - if not cached.categories or category not in cached.categories: - continue + if category and not get_indexing(flask_login.current_user).capture_in_category(cached.uuid, category): + continue if show_hidden: # Only display the hidden ones @@ -1367,7 +1371,7 @@ def get_index_params(request: Request) -> tuple[bool, str]: @app.route('/index', methods=['GET']) def index() -> str: show_error, category = get_index_params(request) - return index_generic(show_error=show_error) + return index_generic(show_error=show_error, category=category) @app.route('/hidden', methods=['GET']) diff --git a/website/web/templates/macros.html b/website/web/templates/macros.html index 0e776190..32bfb40e 100644 --- a/website/web/templates/macros.html +++ b/website/web/templates/macros.html @@ -14,6 +14,7 @@
+ {% if categories_info is mapping %} {% for mt, val in categories_info.items() %}