diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py index 246cc744..96cab5de 100644 --- a/lookyloo/indexing.py +++ b/lookyloo/indexing.py @@ -170,21 +170,24 @@ class Indexing(): # ###### Cookies ###### + def _reindex_cookies(self, cookie_name: str) -> None: + # We changed the format of the indexes, so we need to make sure they're re-triggered. + pipeline = self.redis.pipeline() + if self.redis.type(f'cn|{cookie_name}|captures') == 'set': # type: ignore[no-untyped-call] + pipeline.srem('indexed_cookies', *[entry.split('|')[0] for entry in self.redis.smembers(f'cn|{cookie_name}|captures')]) + pipeline.delete(f'cn|{cookie_name}|captures') + if self.redis.type(f'cn|{cookie_name}') == 'zset': # type: ignore[no-untyped-call] + for domain in self.redis.zrevrangebyscore(f'cn|{cookie_name}', '+inf', '-inf'): + pipeline.delete(f'cn|{cookie_name}|{domain}') + pipeline.delete(domain) + pipeline.delete(f'cn|{cookie_name}') + if self.redis.type('cookies_names') == 'zset': # type: ignore[no-untyped-call] + pipeline.delete('cookies_names') + pipeline.execute() + @property - def cookies_names(self) -> list[tuple[str, float]]: - return self.redis.zrevrange('cookies_names', 0, -1, withscores=True) - - def cookies_names_number_domains(self, cookie_name: str) -> int: - return self.redis.zcard(f'cn|{cookie_name}') - - def cookies_names_domains_values(self, cookie_name: str, domain: str) -> list[tuple[str, float]]: - return self.redis.zrevrange(f'cn|{cookie_name}|{domain}', 0, -1, withscores=True) - - def get_cookie_domains(self, cookie_name: str) -> list[tuple[str, float]]: - return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True) - - def get_cookies_names_captures(self, cookie_name: str) -> list[tuple[str, str]]: - return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')] # type: ignore[misc] + def cookies_names(self) -> set[str]: + return self.redis.smembers('cookies_names') def index_cookies_capture(self, crawled_tree: CrawledTree) 
-> None: if self.redis.sismember('indexed_cookies', crawled_tree.uuid): @@ -192,39 +195,63 @@ class Indexing(): return self.logger.debug(f'Indexing cookies for {crawled_tree.uuid} ... ') self.redis.sadd('indexed_cookies', crawled_tree.uuid) pipeline = self.redis.pipeline() - already_loaded: set[tuple[str, str]] = set() - # used if we need to reindex a capture - already_cleaned_up: set[str] = set() - is_reindex = False + + # Add the cookies_names key in internal indexes set + internal_index = f'capture_indexes|{crawled_tree.uuid}' + pipeline.sadd(internal_index, 'cookies_names') + + already_indexed_global: set[str] = set() for urlnode in crawled_tree.root_hartree.url_tree.traverse(): if 'cookies_received' not in urlnode.features: continue for domain, cookie, _ in urlnode.cookies_received: name, value = cookie.split('=', 1) - if (name, domain) in already_loaded: - # Only add cookie name once / capture - continue - already_loaded.add((name, domain)) - if name not in already_cleaned_up: - # We only run this srem once per name for a capture, - # before adding it for the first time - to_remove = [key for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*')] - if to_remove: - pipeline.srem(f'cn|{name}|captures', *to_remove) - is_reindex = True - self.logger.debug(f'reindexing cookies for {crawled_tree.uuid} ... 
') - already_cleaned_up.add(name) - pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}') - if not is_reindex: - pipeline.zincrby('cookies_names', 1, name) - pipeline.zincrby(f'cn|{name}', 1, domain) - pipeline.zincrby(f'cn|{name}|{domain}', 1, value) - pipeline.sadd(domain, name) + self._reindex_cookies(name) + if name not in already_indexed_global: + # The cookie hasn't been indexed in that run yet + already_indexed_global.add(name) + pipeline.sadd(f'{internal_index}|cookies_names', name) + pipeline.sadd('cookies_names', name) + pipeline.zadd(f'cookies_names|{name}|captures', + mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()}) + + # Add hostnode UUID in internal index + pipeline.sadd(f'{internal_index}|cookies_names|{name}', urlnode.uuid) pipeline.execute() self.logger.debug(f'done with cookies for {crawled_tree.uuid}.') + def get_captures_cookies_name(self, cookie_name: str, most_recent_capture: datetime | None = None, + oldest_capture: datetime | None = None) -> list[tuple[str, float]]: + """Get all the captures for a specific cookie name, on a time interval starting from the most recent one. + + :param cookie_name: The cookie name + :param most_recent_capture: The capture time of the most recent capture to consider + :param oldest_capture: The capture time of the oldest capture to consider, defaults to 15 days ago. + """ + max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf' + min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=15)).timestamp() + if self.redis.type(f'cn|{cookie_name}|captures') == 'set': # type: ignore[no-untyped-call] + # triggers the re-index soon. 
+ self.redis.srem('indexed_cookies', *[entry.split('|')[0] for entry in self.redis.smembers(f'cn|{cookie_name}|captures')]) + return [] + return self.redis.zrevrangebyscore(f'cookies_names|{cookie_name}|captures', max_score, min_score, withscores=True) + + def get_captures_cookie_name_count(self, cookie_name: str) -> int: + return self.redis.zcard(f'cookies_names|{cookie_name}|captures') + + def get_capture_cookie_name_nodes(self, capture_uuid: str, cookie_name: str) -> set[str]: + if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|cookies_names|{cookie_name}'): + return set(url_nodes) + return set() + + def cookies_names_domains_values(self, cookie_name: str, domain: str) -> list[tuple[str, float]]: + return self.redis.zrevrange(f'cn|{cookie_name}|{domain}', 0, -1, withscores=True) + + def get_cookie_domains(self, cookie_name: str) -> list[tuple[str, float]]: + return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True) + # ###### Body hashes ###### def _reindex_ressources(self, h: str) -> None: @@ -381,7 +407,7 @@ class Indexing(): min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=15)).timestamp() if self.redis.type(f'hhhashes|{hhh}|captures') == 'set': # type: ignore[no-untyped-call] # triggers the re-index soon. 
- self.redis.srem('indexed_urls', *self.redis.smembers(f'hhhashes|{hhh}|captures')) + self.redis.srem('indexed_hhhashes', *self.redis.smembers(f'hhhashes|{hhh}|captures')) return [] return self.redis.zrevrangebyscore(f'hhhashes|{hhh}|captures', max_score, min_score, withscores=True) diff --git a/website/web/__init__.py b/website/web/__init__.py index be474967..441d33f7 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -417,13 +417,13 @@ def get_url_investigator(url: str) -> list[tuple[str, str, str, datetime, set[st ) for cache in cached_captures] -def get_cookie_name_investigator(cookie_name: str, /) -> tuple[list[tuple[str, str]], list[tuple[str, float, list[tuple[str, float]]]]]: +def get_cookie_name_investigator(cookie_name: str, /) -> list[tuple[str, str, datetime, set[str]]]: '''Returns all the captures related to a cookie name entry, used in the web interface.''' - cached_captures = lookyloo.sorted_capture_cache([entry[0] for entry in get_indexing(flask_login.current_user).get_cookies_names_captures(cookie_name)]) - captures = [(cache.uuid, cache.title) for cache in cached_captures] - domains = [(domain, freq, get_indexing(flask_login.current_user).cookies_names_domains_values(cookie_name, domain)) - for domain, freq in get_indexing(flask_login.current_user).get_cookie_domains(cookie_name)] - return captures, domains + cached_captures = lookyloo.sorted_capture_cache( + [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_cookies_name(cookie_name=cookie_name)], + cached_captures_only=True) + captures = [(cache.uuid, cache.title, cache.timestamp, get_indexing(flask_login.current_user).get_capture_cookie_name_nodes(cache.uuid, cookie_name)) for cache in cached_captures] + return captures def get_identifier_investigator(identifier_type: str, identifier: str) -> list[tuple[str, str, str, datetime]]: @@ -1420,8 +1420,9 @@ def index_hidden() -> str: @app.route('/cookies', methods=['GET']) def cookies_lookup() -> str: - 
cookies_names = [(name, freq, get_indexing(flask_login.current_user).cookies_names_number_domains(name)) - for name, freq in get_indexing(flask_login.current_user).cookies_names] + cookies_names = [] + for name in get_indexing(flask_login.current_user).cookies_names: + cookies_names.append((name, get_indexing(flask_login.current_user).get_captures_cookie_name_count(name))) return render_template('cookies.html', cookies_names=cookies_names) @@ -1737,8 +1738,8 @@ def simple_capture() -> str | Response | WerkzeugResponse: @app.route('/cookies/', methods=['GET']) def cookies_name_detail(cookie_name: str) -> str: - captures, domains = get_cookie_name_investigator(cookie_name.strip()) - return render_template('cookie_name.html', cookie_name=cookie_name, domains=domains, captures=captures) + captures = get_cookie_name_investigator(cookie_name.strip()) + return render_template('cookie_name.html', cookie_name=cookie_name, captures=captures) @app.route('/hhhdetails/', methods=['GET']) diff --git a/website/web/templates/cookie_name.html b/website/web/templates/cookie_name.html index 9ce295ae..6784869d 100644 --- a/website/web/templates/cookie_name.html +++ b/website/web/templates/cookie_name.html @@ -1,67 +1,66 @@ {% extends "main.html" %} -{% from 'bootstrap5/utils.html' import render_messages %} - -{% block title %}{{ cookie_name }}{% endblock %} +{% from "macros.html" import shorten_string %} {% block scripts %} {{ super() }} - - {% endblock %} {% block content %} -
-

{{ cookie_name }}

- -
-
- - - - - - - - - - {% for domain, freq, values in domains %} - - - - - - {% endfor %} - -
Domain nameFrequencyValue
- {{ domain }} - {{ freq }} -
    - {% for value, freq in values %} -
  • {{ value }} - {{ freq }}
  • - {% endfor %} -
-
-
-

A cookie with that name was seen in these captures:

- +
+

{{ cookie_name }}

+
Only the most recent captures are listed below, this will change soon.
+
+ + + + + + + + + + {% for capture_uuid, title, capture_time, nodes in captures %} + + + + + {% endfor %} + +
Capture TimeCapture Title
+ {{capture_time}} + + + {{ title }} + +
+ The capture contains a cookie with this name in {{ nodes|length }} nodes, click below to see them on the tree: +
    + {% for node in nodes %} +
  • + +
  • + {% endfor %} +
+
{% endblock %}