chg: quick and dirty category indexing

pull/926/head
Raphaël Vinot 2024-07-23 00:21:26 +02:00
parent abf5e0cccb
commit b20ddb4788
6 changed files with 64 additions and 33 deletions

View File

@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager):
# Don't need the cache in this class.
self.lookyloo.clear_tree_cache()
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool], str], None, None]:
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
# NOTE: only get the non-archived captures for now.
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer:
@ -89,10 +89,12 @@ class BackgroundIndexer(AbstractManager):
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.indexing.index_identifiers_capture(ct)
if not indexed[6]:
self.logger.info(f'Indexing categories for {uuid_to_index}')
categories = self.lookyloo.categories_capture(uuid_to_index)
self.indexing.index_categories_capture(uuid_to_index, categories)
if not indexed[7]:
self.logger.info(f'Indexing hash types for {uuid_to_index}')
self.indexing.index_capture_hashes_types(ct)
# NOTE: categories aren't taken into account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories)
self.indexing.indexing_done()
self.logger.info('... done.')

View File

@ -47,7 +47,7 @@ class LookylooCacheLogAdapter(LoggerAdapter): # type: ignore[type-arg]
class CaptureCache():
__slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir',
'error', 'no_index', 'categories', 'parent',
'error', 'no_index', 'parent',
'user_agent', 'referer', 'logger')
def __init__(self, cache_entry: dict[str, Any]):
@ -89,7 +89,6 @@ class CaptureCache():
# if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along
self.error: str | None = cache_entry.get('error')
self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
self.categories: list[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else []
self.parent: str | None = cache_entry.get('parent')
self.user_agent: str | None = cache_entry.get('user_agent')
self.referer: str | None = cache_entry.get('referer')
@ -484,10 +483,6 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
and "No har files in" not in cache['error']):
logger.info(cache['error'])
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
if (capture_dir / 'no_index').exists():
# If the folder claims anonymity
cache['no_index'] = 1

View File

@ -8,7 +8,6 @@ import logging
# import re
from io import BytesIO
from collections import defaultdict
from typing import Iterable
from urllib.parse import urlsplit
from zipfile import ZipFile
@ -69,13 +68,14 @@ class Indexing():
p.srem('indexed_hhhashes', capture_uuid)
p.srem('indexed_favicons', capture_uuid)
p.srem('indexed_identifiers', capture_uuid)
p.srem('indexed_categories', capture_uuid)
for identifier_type in self.identifiers_types():
p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
for hash_type in self.captures_hashes_types():
p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
p.execute()
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool]:
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool]:
p = self.redis.pipeline()
p.sismember('indexed_urls', capture_uuid)
p.sismember('indexed_body_hashes', capture_uuid)
@ -83,6 +83,7 @@ class Indexing():
p.sismember('indexed_hhhashes', capture_uuid)
p.sismember('indexed_favicons', capture_uuid)
p.sismember('indexed_identifiers', capture_uuid)
p.sismember('indexed_categories', capture_uuid)
# We also need to check if the hash_type are all indexed for this capture
hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
to_return: list[bool] = p.execute()
@ -548,24 +549,34 @@ class Indexing():
# ###### Categories ######
@property
def categories(self) -> list[tuple[str, int]]:
return [(c, int(score))
for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]
def categories(self) -> set[str]:
return self.redis.smembers('categories')
def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]) -> None:
if not categories:
return
def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
if self.redis.sismember('indexed_categories', capture_uuid):
# do not reindex
return
self.redis.sadd('indexed_categories', capture_uuid)
if not categories:
return
added_in_existing_categories = set()
pipeline = self.redis.pipeline()
for category in categories:
pipeline.zincrby('categories', 1, category)
pipeline.sadd(category, capture_uuid)
for c in self.categories:
if c in capture_categories:
pipeline.sadd(c, capture_uuid)
added_in_existing_categories.add(c)
else:
# the capture is not in that category; srem is as cheap as an existence check when the uuid is not in the set
pipeline.srem(c, capture_uuid)
# Handle the new categories
for new_c in set(capture_categories) - added_in_existing_categories:
pipeline.sadd(new_c, capture_uuid)
pipeline.sadd('categories', new_c)
pipeline.execute()
# Return every member of the Redis set stored under the key `category`
# (index_categories_capture adds capture UUIDs to that set).
def get_captures_category(self, category: str) -> set[str]:
return self.redis.smembers(category)
# Tell whether `capture_uuid` is a member of the Redis set stored under the
# key `category`.
def capture_in_category(self, capture_uuid: str, category: str) -> bool:
return self.redis.sismember(category, capture_uuid)
# Remove `capture_uuid` from the 'indexed_categories' set. This does not
# re-index anything by itself: index_categories_capture returns early for
# UUIDs found in that set, so removing the UUID lets a later call index the
# capture's categories again.
def reindex_categories_capture(self, capture_uuid: str) -> None:
self.redis.srem('indexed_categories', capture_uuid)

View File

@ -308,16 +308,15 @@ class Lookyloo():
return None
def categories_capture(self, capture_uuid: str, /) -> dict[str, Any]:
def categories_capture(self, capture_uuid: str, /) -> list[str]:
'''Get all the categories related to a capture, in MISP Taxonomies format'''
categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
# get existing categories if possible
if categ_file.exists():
with categ_file.open() as f:
current_categories = [line.strip() for line in f.readlines()]
return [line.strip() for line in f.readlines()]
# return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
return {e: e for e in current_categories}
return {}
return []
def categorize_capture(self, capture_uuid: str, /, category: str) -> None:
'''Add a category (MISP Taxonomy tag) to a capture.'''

View File

@ -41,8 +41,10 @@ from werkzeug.wrappers.response import Response as WerkzeugResponse
from lookyloo import Lookyloo, CaptureSettings
from lookyloo.default import get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile, LacusUnreachable
from lookyloo.helpers import (get_taxonomies, UserAgents, load_cookies,
load_user_config)
from lookyloo.helpers import (UserAgents, load_cookies,
load_user_config,
get_taxonomies
)
if sys.version_info < (3, 9):
from pytz import all_timezones_set
@ -675,7 +677,7 @@ def historical_lookups(tree_uuid: str) -> str | WerkzeugResponse | Response:
def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | Response:
if not enable_categorization:
return redirect(url_for('tree', tree_uuid=tree_uuid))
matching_categories = None
matching_categories: dict[str, Any] = {}
if 'verification-status' in request.form:
status = request.form.get('verification-status')
# fast categories
@ -692,6 +694,7 @@ def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | R
categories.append(category)
for category in categories:
lookyloo.categorize_capture(tree_uuid, category)
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
if 'query' in request.form and request.form.get('query', '').strip():
matching_categories = {}
t = get_taxonomies()
@ -711,6 +714,7 @@ def uncategorize_capture(tree_uuid: str, category: str) -> str | WerkzeugRespons
if not enable_categorization:
return jsonify({'response': 'Categorization not enabled.'})
lookyloo.uncategorize_capture(tree_uuid, category)
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
return jsonify({'response': f'{category} successfully removed from {tree_uuid}'})
@ -721,6 +725,7 @@ def categorize_capture(tree_uuid: str, category: str) -> str | WerkzeugResponse
if not enable_categorization:
return jsonify({'response': 'Categorization not enabled.'})
lookyloo.categorize_capture(tree_uuid, category)
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
return jsonify({'response': f'{category} successfully added to {tree_uuid}'})
@ -1327,9 +1332,8 @@ def index_generic(show_hidden: bool=False, show_error: bool=True, category: str
if cut_time and cached.timestamp < cut_time_with_tz:
continue
if category:
if not cached.categories or category not in cached.categories:
continue
if category and not get_indexing(flask_login.current_user).capture_in_category(cached.uuid, category):
continue
if show_hidden:
# Only display the hidden ones
@ -1367,7 +1371,7 @@ def get_index_params(request: Request) -> tuple[bool, str]:
@app.route('/index', methods=['GET'])
def index() -> str:
show_error, category = get_index_params(request)
return index_generic(show_error=show_error)
return index_generic(show_error=show_error, category=category)
@app.route('/hidden', methods=['GET'])

View File

@ -14,6 +14,7 @@
</tr>
</thead>
<tbody>
{% if categories_info is mapping %}
{% for mt, val in categories_info.items() %}
<tr>
<td><a href="https://www.misp-project.org/taxonomies.html#_{{ val[0].name }}">{{ val[0].name }}</a></td>
@ -42,6 +43,25 @@
</td>
</tr>
{% endfor %}
{% else %}
{% for mt in categories_info %}
<tr>
<td></td>
<td>
</td>
<td>{{ mt }}</td>
<td>
<button type="button" class="btn btn-primary {% if add_category %}categorize_capture{% else %}uncategorize_capture{% endif %}" value="{{ mt }}">
{% if add_category %}
Categorize capture.
{% else %}
Uncategorize capture.
{% endif %}
</button>
</td>
</tr>
{% endfor %}
{% endif %}
</tbody>
</table>
</div>