chg: quick and dirty category indexing

pull/926/head
Raphaël Vinot 2024-07-23 00:21:26 +02:00
parent abf5e0cccb
commit b20ddb4788
6 changed files with 64 additions and 33 deletions

View File

@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager):
# Don't need the cache in this class. # Don't need the cache in this class.
self.lookyloo.clear_tree_cache() self.lookyloo.clear_tree_cache()
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool], str], None, None]: def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
# NOTE: only get the non-archived captures for now. # NOTE: only get the non-archived captures for now.
for uuid, directory in self.redis.hscan_iter('lookup_dirs'): for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer: if not self.full_indexer:
@ -89,10 +89,12 @@ class BackgroundIndexer(AbstractManager):
self.logger.info(f'Indexing identifiers for {uuid_to_index}') self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.indexing.index_identifiers_capture(ct) self.indexing.index_identifiers_capture(ct)
if not indexed[6]: if not indexed[6]:
self.logger.info(f'Indexing categories for {uuid_to_index}')
categories = self.lookyloo.categories_capture(uuid_to_index)
self.indexing.index_categories_capture(uuid_to_index, categories)
if not indexed[7]:
self.logger.info(f'Indexing hash types for {uuid_to_index}') self.logger.info(f'Indexing hash types for {uuid_to_index}')
self.indexing.index_capture_hashes_types(ct) self.indexing.index_capture_hashes_types(ct)
# NOTE: categories aren't taken in account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories)
self.indexing.indexing_done() self.indexing.indexing_done()
self.logger.info('... done.') self.logger.info('... done.')

View File

@ -47,7 +47,7 @@ class LookylooCacheLogAdapter(LoggerAdapter): # type: ignore[type-arg]
class CaptureCache(): class CaptureCache():
__slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir', __slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir',
'error', 'no_index', 'categories', 'parent', 'error', 'no_index', 'parent',
'user_agent', 'referer', 'logger') 'user_agent', 'referer', 'logger')
def __init__(self, cache_entry: dict[str, Any]): def __init__(self, cache_entry: dict[str, Any]):
@ -89,7 +89,6 @@ class CaptureCache():
# if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along # if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along
self.error: str | None = cache_entry.get('error') self.error: str | None = cache_entry.get('error')
self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
self.categories: list[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else []
self.parent: str | None = cache_entry.get('parent') self.parent: str | None = cache_entry.get('parent')
self.user_agent: str | None = cache_entry.get('user_agent') self.user_agent: str | None = cache_entry.get('user_agent')
self.referer: str | None = cache_entry.get('referer') self.referer: str | None = cache_entry.get('referer')
@ -484,10 +483,6 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
and "No har files in" not in cache['error']): and "No har files in" not in cache['error']):
logger.info(cache['error']) logger.info(cache['error'])
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
if (capture_dir / 'no_index').exists(): if (capture_dir / 'no_index').exists():
# If the folders claims anonymity # If the folders claims anonymity
cache['no_index'] = 1 cache['no_index'] = 1

View File

@ -8,7 +8,6 @@ import logging
# import re # import re
from io import BytesIO from io import BytesIO
from collections import defaultdict from collections import defaultdict
from typing import Iterable
from urllib.parse import urlsplit from urllib.parse import urlsplit
from zipfile import ZipFile from zipfile import ZipFile
@ -69,13 +68,14 @@ class Indexing():
p.srem('indexed_hhhashes', capture_uuid) p.srem('indexed_hhhashes', capture_uuid)
p.srem('indexed_favicons', capture_uuid) p.srem('indexed_favicons', capture_uuid)
p.srem('indexed_identifiers', capture_uuid) p.srem('indexed_identifiers', capture_uuid)
p.srem('indexed_categories', capture_uuid)
for identifier_type in self.identifiers_types(): for identifier_type in self.identifiers_types():
p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid) p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
for hash_type in self.captures_hashes_types(): for hash_type in self.captures_hashes_types():
p.srem(f'indexed_hash_type|{hash_type}', capture_uuid) p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
p.execute() p.execute()
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool]: def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool]:
p = self.redis.pipeline() p = self.redis.pipeline()
p.sismember('indexed_urls', capture_uuid) p.sismember('indexed_urls', capture_uuid)
p.sismember('indexed_body_hashes', capture_uuid) p.sismember('indexed_body_hashes', capture_uuid)
@ -83,6 +83,7 @@ class Indexing():
p.sismember('indexed_hhhashes', capture_uuid) p.sismember('indexed_hhhashes', capture_uuid)
p.sismember('indexed_favicons', capture_uuid) p.sismember('indexed_favicons', capture_uuid)
p.sismember('indexed_identifiers', capture_uuid) p.sismember('indexed_identifiers', capture_uuid)
p.sismember('indexed_categories', capture_uuid)
# We also need to check if the hash_type are all indexed for this capture # We also need to check if the hash_type are all indexed for this capture
hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types()) hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
to_return: list[bool] = p.execute() to_return: list[bool] = p.execute()
@ -548,24 +549,34 @@ class Indexing():
# ###### Categories ###### # ###### Categories ######
@property @property
def categories(self) -> list[tuple[str, int]]: def categories(self) -> set[str]:
return [(c, int(score)) return self.redis.smembers('categories')
for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]
def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]) -> None: def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
if not categories:
return
if self.redis.sismember('indexed_categories', capture_uuid): if self.redis.sismember('indexed_categories', capture_uuid):
# do not reindex # do not reindex
return return
self.redis.sadd('indexed_categories', capture_uuid) self.redis.sadd('indexed_categories', capture_uuid)
if not categories: added_in_existing_categories = set()
return
pipeline = self.redis.pipeline() pipeline = self.redis.pipeline()
for category in categories: for c in self.categories:
pipeline.zincrby('categories', 1, category) if c in capture_categories:
pipeline.sadd(category, capture_uuid) pipeline.sadd(c, capture_uuid)
added_in_existing_categories.add(c)
else:
# the capture is not in that category, srem is as cheap as exists if not in the set
pipeline.srem(c, capture_uuid)
# Handle the new categories
for new_c in set(capture_categories) - added_in_existing_categories:
pipeline.sadd(new_c, capture_uuid)
pipeline.sadd('categories', new_c)
pipeline.execute() pipeline.execute()
def get_captures_category(self, category: str) -> set[str]: def get_captures_category(self, category: str) -> set[str]:
return self.redis.smembers(category) return self.redis.smembers(category)
def capture_in_category(self, capture_uuid: str, category: str) -> bool:
return self.redis.sismember(category, capture_uuid)
def reindex_categories_capture(self, capture_uuid: str) -> None:
self.redis.srem('indexed_categories', capture_uuid)

View File

@ -308,16 +308,15 @@ class Lookyloo():
return None return None
def categories_capture(self, capture_uuid: str, /) -> dict[str, Any]: def categories_capture(self, capture_uuid: str, /) -> list[str]:
'''Get all the categories related to a capture, in MISP Taxonomies format''' '''Get all the categories related to a capture, in MISP Taxonomies format'''
categ_file = self._captures_index[capture_uuid].capture_dir / 'categories' categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
# get existing categories if possible # get existing categories if possible
if categ_file.exists(): if categ_file.exists():
with categ_file.open() as f: with categ_file.open() as f:
current_categories = [line.strip() for line in f.readlines()] return [line.strip() for line in f.readlines()]
# return {e: self.taxonomies.revert_machinetag(e) for e in current_categories} # return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
return {e: e for e in current_categories} return []
return {}
def categorize_capture(self, capture_uuid: str, /, category: str) -> None: def categorize_capture(self, capture_uuid: str, /, category: str) -> None:
'''Add a category (MISP Taxonomy tag) to a capture.''' '''Add a category (MISP Taxonomy tag) to a capture.'''

View File

@ -41,8 +41,10 @@ from werkzeug.wrappers.response import Response as WerkzeugResponse
from lookyloo import Lookyloo, CaptureSettings from lookyloo import Lookyloo, CaptureSettings
from lookyloo.default import get_config from lookyloo.default import get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile, LacusUnreachable from lookyloo.exceptions import MissingUUID, NoValidHarFile, LacusUnreachable
from lookyloo.helpers import (get_taxonomies, UserAgents, load_cookies, from lookyloo.helpers import (UserAgents, load_cookies,
load_user_config) load_user_config,
get_taxonomies
)
if sys.version_info < (3, 9): if sys.version_info < (3, 9):
from pytz import all_timezones_set from pytz import all_timezones_set
@ -675,7 +677,7 @@ def historical_lookups(tree_uuid: str) -> str | WerkzeugResponse | Response:
def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | Response: def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | Response:
if not enable_categorization: if not enable_categorization:
return redirect(url_for('tree', tree_uuid=tree_uuid)) return redirect(url_for('tree', tree_uuid=tree_uuid))
matching_categories = None matching_categories: dict[str, Any] = {}
if 'verification-status' in request.form: if 'verification-status' in request.form:
status = request.form.get('verification-status') status = request.form.get('verification-status')
# fast categories # fast categories
@ -692,6 +694,7 @@ def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | R
categories.append(category) categories.append(category)
for category in categories: for category in categories:
lookyloo.categorize_capture(tree_uuid, category) lookyloo.categorize_capture(tree_uuid, category)
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
if 'query' in request.form and request.form.get('query', '').strip(): if 'query' in request.form and request.form.get('query', '').strip():
matching_categories = {} matching_categories = {}
t = get_taxonomies() t = get_taxonomies()
@ -711,6 +714,7 @@ def uncategorize_capture(tree_uuid: str, category: str) -> str | WerkzeugRespons
if not enable_categorization: if not enable_categorization:
return jsonify({'response': 'Categorization not enabled.'}) return jsonify({'response': 'Categorization not enabled.'})
lookyloo.uncategorize_capture(tree_uuid, category) lookyloo.uncategorize_capture(tree_uuid, category)
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
return jsonify({'response': f'{category} successfully removed from {tree_uuid}'}) return jsonify({'response': f'{category} successfully removed from {tree_uuid}'})
@ -721,6 +725,7 @@ def categorize_capture(tree_uuid: str, category: str) -> str | WerkzeugResponse
if not enable_categorization: if not enable_categorization:
return jsonify({'response': 'Categorization not enabled.'}) return jsonify({'response': 'Categorization not enabled.'})
lookyloo.categorize_capture(tree_uuid, category) lookyloo.categorize_capture(tree_uuid, category)
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
return jsonify({'response': f'{category} successfully added to {tree_uuid}'}) return jsonify({'response': f'{category} successfully added to {tree_uuid}'})
@ -1327,9 +1332,8 @@ def index_generic(show_hidden: bool=False, show_error: bool=True, category: str
if cut_time and cached.timestamp < cut_time_with_tz: if cut_time and cached.timestamp < cut_time_with_tz:
continue continue
if category: if category and not get_indexing(flask_login.current_user).capture_in_category(cached.uuid, category):
if not cached.categories or category not in cached.categories: continue
continue
if show_hidden: if show_hidden:
# Only display the hidden ones # Only display the hidden ones
@ -1367,7 +1371,7 @@ def get_index_params(request: Request) -> tuple[bool, str]:
@app.route('/index', methods=['GET']) @app.route('/index', methods=['GET'])
def index() -> str: def index() -> str:
show_error, category = get_index_params(request) show_error, category = get_index_params(request)
return index_generic(show_error=show_error) return index_generic(show_error=show_error, category=category)
@app.route('/hidden', methods=['GET']) @app.route('/hidden', methods=['GET'])

View File

@ -14,6 +14,7 @@
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{% if categories_info is mapping %}
{% for mt, val in categories_info.items() %} {% for mt, val in categories_info.items() %}
<tr> <tr>
<td><a href="https://www.misp-project.org/taxonomies.html#_{{ val[0].name }}">{{ val[0].name }}</a></td> <td><a href="https://www.misp-project.org/taxonomies.html#_{{ val[0].name }}">{{ val[0].name }}</a></td>
@ -42,6 +43,25 @@
</td> </td>
</tr> </tr>
{% endfor %} {% endfor %}
{% else %}
{% for mt in categories_info %}
<tr>
<td></td>
<td>
</td>
<td>{{ mt }}</td>
<td>
<button type="button" class="btn btn-primary {% if add_category %}categorize_capture{% else %}uncategorize_capture{% endif %}" value="{{ mt }}">
{% if add_category %}
Categorize capture.
{% else %}
Uncategorize capture.
{% endif %}
</button>
</td>
</tr>
{% endfor %}
{% endif %}
</tbody> </tbody>
</table> </table>
</div> </div>