mirror of https://github.com/CIRCL/lookyloo
chg: quick and dirty category indexing
parent
abf5e0cccb
commit
b20ddb4788
|
@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager):
|
|||
# Don't need the cache in this class.
|
||||
self.lookyloo.clear_tree_cache()
|
||||
|
||||
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool], str], None, None]:
|
||||
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool, bool], str], None, None]:
|
||||
# NOTE: only get the non-archived captures for now.
|
||||
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
|
||||
if not self.full_indexer:
|
||||
|
@ -89,10 +89,12 @@ class BackgroundIndexer(AbstractManager):
|
|||
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
|
||||
self.indexing.index_identifiers_capture(ct)
|
||||
if not indexed[6]:
|
||||
self.logger.info(f'Indexing categories for {uuid_to_index}')
|
||||
categories = self.lookyloo.categories_capture(uuid_to_index)
|
||||
self.indexing.index_categories_capture(uuid_to_index, categories)
|
||||
if not indexed[7]:
|
||||
self.logger.info(f'Indexing hash types for {uuid_to_index}')
|
||||
self.indexing.index_capture_hashes_types(ct)
|
||||
# NOTE: categories aren't taken into account here, should be fixed(?)
|
||||
# see indexing.index_categories_capture(capture_uuid, categories)
|
||||
self.indexing.indexing_done()
|
||||
self.logger.info('... done.')
|
||||
|
||||
|
|
|
@ -47,7 +47,7 @@ class LookylooCacheLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|||
|
||||
class CaptureCache():
|
||||
__slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir',
|
||||
'error', 'no_index', 'categories', 'parent',
|
||||
'error', 'no_index', 'parent',
|
||||
'user_agent', 'referer', 'logger')
|
||||
|
||||
def __init__(self, cache_entry: dict[str, Any]):
|
||||
|
@ -89,7 +89,6 @@ class CaptureCache():
|
|||
# if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along
|
||||
self.error: str | None = cache_entry.get('error')
|
||||
self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
|
||||
self.categories: list[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else []
|
||||
self.parent: str | None = cache_entry.get('parent')
|
||||
self.user_agent: str | None = cache_entry.get('user_agent')
|
||||
self.referer: str | None = cache_entry.get('referer')
|
||||
|
@ -484,10 +483,6 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
|
|||
and "No har files in" not in cache['error']):
|
||||
logger.info(cache['error'])
|
||||
|
||||
if (capture_dir / 'categories').exists():
|
||||
with (capture_dir / 'categories').open() as _categories:
|
||||
cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
|
||||
|
||||
if (capture_dir / 'no_index').exists():
|
||||
# If the folder claims anonymity
|
||||
cache['no_index'] = 1
|
||||
|
|
|
@ -8,7 +8,6 @@ import logging
|
|||
# import re
|
||||
from io import BytesIO
|
||||
from collections import defaultdict
|
||||
from typing import Iterable
|
||||
from urllib.parse import urlsplit
|
||||
from zipfile import ZipFile
|
||||
|
||||
|
@ -69,13 +68,14 @@ class Indexing():
|
|||
p.srem('indexed_hhhashes', capture_uuid)
|
||||
p.srem('indexed_favicons', capture_uuid)
|
||||
p.srem('indexed_identifiers', capture_uuid)
|
||||
p.srem('indexed_categories', capture_uuid)
|
||||
for identifier_type in self.identifiers_types():
|
||||
p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
|
||||
for hash_type in self.captures_hashes_types():
|
||||
p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
|
||||
p.execute()
|
||||
|
||||
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool]:
|
||||
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool, bool]:
|
||||
p = self.redis.pipeline()
|
||||
p.sismember('indexed_urls', capture_uuid)
|
||||
p.sismember('indexed_body_hashes', capture_uuid)
|
||||
|
@ -83,6 +83,7 @@ class Indexing():
|
|||
p.sismember('indexed_hhhashes', capture_uuid)
|
||||
p.sismember('indexed_favicons', capture_uuid)
|
||||
p.sismember('indexed_identifiers', capture_uuid)
|
||||
p.sismember('indexed_categories', capture_uuid)
|
||||
# We also need to check that all the hash types are indexed for this capture
|
||||
hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
|
||||
to_return: list[bool] = p.execute()
|
||||
|
@ -548,24 +549,34 @@ class Indexing():
|
|||
# ###### Categories ######
|
||||
|
||||
@property
|
||||
def categories(self) -> list[tuple[str, int]]:
|
||||
return [(c, int(score))
|
||||
for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]
|
||||
def categories(self) -> set[str]:
|
||||
return self.redis.smembers('categories')
|
||||
|
||||
def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]) -> None:
|
||||
if not categories:
|
||||
return
|
||||
def index_categories_capture(self, capture_uuid: str, capture_categories: list[str]) -> None:
|
||||
if self.redis.sismember('indexed_categories', capture_uuid):
|
||||
# do not reindex
|
||||
return
|
||||
self.redis.sadd('indexed_categories', capture_uuid)
|
||||
if not categories:
|
||||
return
|
||||
added_in_existing_categories = set()
|
||||
pipeline = self.redis.pipeline()
|
||||
for category in categories:
|
||||
pipeline.zincrby('categories', 1, category)
|
||||
pipeline.sadd(category, capture_uuid)
|
||||
for c in self.categories:
|
||||
if c in capture_categories:
|
||||
pipeline.sadd(c, capture_uuid)
|
||||
added_in_existing_categories.add(c)
|
||||
else:
|
||||
# the capture is not in that category, srem is as cheap as exists if not in the set
|
||||
pipeline.srem(c, capture_uuid)
|
||||
# Handle the new categories
|
||||
for new_c in set(capture_categories) - added_in_existing_categories:
|
||||
pipeline.sadd(new_c, capture_uuid)
|
||||
pipeline.sadd('categories', new_c)
|
||||
pipeline.execute()
|
||||
|
||||
def get_captures_category(self, category: str) -> set[str]:
    """Return the members of the Redis set stored under the key `category`.

    :param category: Name of the category; it is used verbatim as the Redis key.
    :return: Whatever `smembers` yields for that key.
    """
    members = self.redis.smembers(category)
    return members
|
||||
|
||||
def capture_in_category(self, capture_uuid: str, category: str) -> bool:
    """Tell whether `capture_uuid` is a member of the Redis set named `category`.

    :param capture_uuid: UUID of the capture to look up.
    :param category: Name of the category; it is used verbatim as the Redis key.
    :return: The result of `sismember` for that key and member.
    """
    is_member = self.redis.sismember(category, capture_uuid)
    return is_member
|
||||
|
||||
def reindex_categories_capture(self, capture_uuid: str) -> None:
    """Remove `capture_uuid` from the 'indexed_categories' Redis set.

    :param capture_uuid: UUID of the capture whose "already indexed" marker is dropped.
    """
    # Only the marker is removed here; no category set is touched by this call.
    indexed_key = 'indexed_categories'
    self.redis.srem(indexed_key, capture_uuid)
|
||||
|
|
|
@ -308,16 +308,15 @@ class Lookyloo():
|
|||
|
||||
return None
|
||||
|
||||
def categories_capture(self, capture_uuid: str, /) -> dict[str, Any]:
|
||||
def categories_capture(self, capture_uuid: str, /) -> list[str]:
|
||||
'''Get all the categories related to a capture, in MISP Taxonomies format'''
|
||||
categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
|
||||
# get existing categories if possible
|
||||
if categ_file.exists():
|
||||
with categ_file.open() as f:
|
||||
current_categories = [line.strip() for line in f.readlines()]
|
||||
return [line.strip() for line in f.readlines()]
|
||||
# return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
|
||||
return {e: e for e in current_categories}
|
||||
return {}
|
||||
return []
|
||||
|
||||
def categorize_capture(self, capture_uuid: str, /, category: str) -> None:
|
||||
'''Add a category (MISP Taxonomy tag) to a capture.'''
|
||||
|
|
|
@ -41,8 +41,10 @@ from werkzeug.wrappers.response import Response as WerkzeugResponse
|
|||
from lookyloo import Lookyloo, CaptureSettings
|
||||
from lookyloo.default import get_config
|
||||
from lookyloo.exceptions import MissingUUID, NoValidHarFile, LacusUnreachable
|
||||
from lookyloo.helpers import (get_taxonomies, UserAgents, load_cookies,
|
||||
load_user_config)
|
||||
from lookyloo.helpers import (UserAgents, load_cookies,
|
||||
load_user_config,
|
||||
get_taxonomies
|
||||
)
|
||||
|
||||
if sys.version_info < (3, 9):
|
||||
from pytz import all_timezones_set
|
||||
|
@ -675,7 +677,7 @@ def historical_lookups(tree_uuid: str) -> str | WerkzeugResponse | Response:
|
|||
def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | Response:
|
||||
if not enable_categorization:
|
||||
return redirect(url_for('tree', tree_uuid=tree_uuid))
|
||||
matching_categories = None
|
||||
matching_categories: dict[str, Any] = {}
|
||||
if 'verification-status' in request.form:
|
||||
status = request.form.get('verification-status')
|
||||
# fast categories
|
||||
|
@ -692,6 +694,7 @@ def categories_capture(tree_uuid: str, query: str) -> str | WerkzeugResponse | R
|
|||
categories.append(category)
|
||||
for category in categories:
|
||||
lookyloo.categorize_capture(tree_uuid, category)
|
||||
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
|
||||
if 'query' in request.form and request.form.get('query', '').strip():
|
||||
matching_categories = {}
|
||||
t = get_taxonomies()
|
||||
|
@ -711,6 +714,7 @@ def uncategorize_capture(tree_uuid: str, category: str) -> str | WerkzeugRespons
|
|||
if not enable_categorization:
|
||||
return jsonify({'response': 'Categorization not enabled.'})
|
||||
lookyloo.uncategorize_capture(tree_uuid, category)
|
||||
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
|
||||
return jsonify({'response': f'{category} successfully removed from {tree_uuid}'})
|
||||
|
||||
|
||||
|
@ -721,6 +725,7 @@ def categorize_capture(tree_uuid: str, category: str) -> str | WerkzeugResponse
|
|||
if not enable_categorization:
|
||||
return jsonify({'response': 'Categorization not enabled.'})
|
||||
lookyloo.categorize_capture(tree_uuid, category)
|
||||
get_indexing(flask_login.current_user).reindex_categories_capture(tree_uuid)
|
||||
return jsonify({'response': f'{category} successfully added to {tree_uuid}'})
|
||||
|
||||
|
||||
|
@ -1327,9 +1332,8 @@ def index_generic(show_hidden: bool=False, show_error: bool=True, category: str
|
|||
if cut_time and cached.timestamp < cut_time_with_tz:
|
||||
continue
|
||||
|
||||
if category:
|
||||
if not cached.categories or category not in cached.categories:
|
||||
continue
|
||||
if category and not get_indexing(flask_login.current_user).capture_in_category(cached.uuid, category):
|
||||
continue
|
||||
|
||||
if show_hidden:
|
||||
# Only display the hidden ones
|
||||
|
@ -1367,7 +1371,7 @@ def get_index_params(request: Request) -> tuple[bool, str]:
|
|||
@app.route('/index', methods=['GET'])
|
||||
def index() -> str:
|
||||
show_error, category = get_index_params(request)
|
||||
return index_generic(show_error=show_error)
|
||||
return index_generic(show_error=show_error, category=category)
|
||||
|
||||
|
||||
@app.route('/hidden', methods=['GET'])
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% if categories_info is mapping %}
|
||||
{% for mt, val in categories_info.items() %}
|
||||
<tr>
|
||||
<td><a href="https://www.misp-project.org/taxonomies.html#_{{ val[0].name }}">{{ val[0].name }}</a></td>
|
||||
|
@ -42,6 +43,25 @@
|
|||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
{% for mt in categories_info %}
|
||||
<tr>
|
||||
<td></td>
|
||||
<td>
|
||||
</td>
|
||||
<td>{{ mt }}</td>
|
||||
<td>
|
||||
<button type="button" class="btn btn-primary {% if add_category %}categorize_capture{% else %}uncategorize_capture{% endif %}" value="{{ mt }}">
|
||||
{% if add_category %}
|
||||
Categorize capture.
|
||||
{% else %}
|
||||
Uncategorize capture.
|
||||
{% endif %}
|
||||
</button>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
|
Loading…
Reference in New Issue