From 354f269218f13acdf4003ad388e083f3ed9ac767 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Mon, 9 Nov 2020 16:02:54 +0100
Subject: [PATCH] new: Integrate categorization in indexing

---
 .github/dependabot.yml                |  4 ---
 bin/rebuild_caches.py                 |  2 ++
 lookyloo/indexing.py                  | 28 +++++++++++++++-
 lookyloo/lookyloo.py                  | 32 +++++++++++++------
 website/web/__init__.py               | 19 ++++++++++-
 website/web/templates/categories.html | 46 +++++++++++++++++++++++++++
 6 files changed, 115 insertions(+), 16 deletions(-)
 create mode 100644 website/web/templates/categories.html

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 1501d1fe..44cece1e 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -9,7 +9,3 @@ updates:
     directory: "/"
     schedule:
       interval: "daily"
-  - package-ecosystem: "pip"
-    directory: "/client/"
-    schedule:
-      interval: "daily"
diff --git a/bin/rebuild_caches.py b/bin/rebuild_caches.py
index 04841de0..23600173 100755
--- a/bin/rebuild_caches.py
+++ b/bin/rebuild_caches.py
@@ -41,6 +41,8 @@ def main():
         indexing.index_cookies_capture(tree)
         indexing.index_body_hashes_capture(tree)
         indexing.index_url_capture(tree)
+        categories = list(lookyloo.categories_capture(capture_uuid).keys())
+        indexing.index_categories_capture(capture_uuid, categories)
 
 
 if __name__ == '__main__':
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index ed1abc79..3f0a2061 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -3,7 +3,7 @@
 
 import hashlib
 from urllib.parse import urlsplit
-from typing import List, Tuple, Set, Dict, Optional
+from typing import List, Tuple, Set, Dict, Optional, Iterable
 from collections import defaultdict
 
 from redis import Redis
@@ -192,3 +192,29 @@ class Indexing():
 
     def get_captures_hostname(self, hostname: str) -> Set[str]:
         return self.redis.smembers(f'hostnames|{hostname}|captures')  # type: ignore
+
+    # ###### Categories ######
+
+    @property
+    def categories(self) -> List[Tuple[str, int]]:
+        return [(c, int(score))
+                for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]
+
+    def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]):
+        if not categories:
+            return
+        print(capture_uuid, categories)
+        if self.redis.sismember('indexed_categories', capture_uuid):
+            # do not reindex
+            return
+        self.redis.sadd('indexed_categories', capture_uuid)
+        if not categories:
+            return
+        pipeline = self.redis.pipeline()
+        for category in categories:
+            pipeline.zincrby('categories', 1, category)
+            pipeline.sadd(category, capture_uuid)
+        pipeline.execute()
+
+    def get_captures_category(self, category: str) -> Set[str]:
+        return self.redis.smembers(category)  # type: ignore
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 4925122d..e9a9b0b5 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -133,6 +133,8 @@ class Lookyloo():
             self.indexing.index_cookies_capture(ct)
             self.indexing.index_body_hashes_capture(ct)
             self.indexing.index_url_capture(ct)
+            categories = list(self.categories_capture(capture_uuid).keys())
+            self.indexing.index_categories_capture(capture_uuid, categories)
         except Har2TreeError as e:
             raise NoValidHarFile(e.message)
 
@@ -272,7 +274,7 @@ class Lookyloo():
             return {}
         return ct.root_hartree.stats
 
-    def categories_capture(self, capture_uuid: str):
+    def categories_capture(self, capture_uuid: str) -> Dict[str, Any]:
         capture_dir = self.lookup_capture_dir(capture_uuid)
         if not capture_dir:
             raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
@@ -280,11 +282,10 @@ class Lookyloo():
         if (capture_dir / 'categories').exists():
             with (capture_dir / 'categories').open() as f:
                 current_categories = [line.strip() for line in f.readlines()]
-        else:
-            current_categories = []
-        return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
+            return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
+        return {}
 
-    def categorize_capture(self, capture_uuid: str, category: str):
+    def categorize_capture(self, capture_uuid: str, category: str) -> None:
         if not get_config('generic', 'enable_categorization'):
             return
         # Make sure the category is mappable to a taxonomy.
@@ -303,7 +304,7 @@ class Lookyloo():
         with (capture_dir / 'categories').open('w') as f:
             f.writelines(f'{t}\n' for t in current_categories)
 
-    def uncategorize_capture(self, capture_uuid: str, category: str):
+    def uncategorize_capture(self, capture_uuid: str, category: str) -> None:
         if not get_config('generic', 'enable_categorization'):
             return
         capture_dir = self.lookup_capture_dir(capture_uuid)
@@ -382,7 +383,7 @@ class Lookyloo():
         error_cache: Dict[str, str] = {}
         if (capture_dir / 'error.txt').exists():
             # Something went wrong
-            with (Path(capture_dir) / 'error.txt').open() as _error:
+            with (capture_dir / 'error.txt').open() as _error:
                 content = _error.read()
                 try:
                     error_to_cache = json.loads(content)
@@ -404,6 +405,12 @@ class Lookyloo():
             error_cache['error'] = f'No har files in {capture_dir.name}'
             fatal_error = True
 
+        if (capture_dir / 'categories').exists():
+            with (capture_dir / 'categories').open() as _categories:
+                categories = [c.strip() for c in _categories.readlines()]
+        else:
+            categories = []
+
         if not redis_pipeline:
             p = self.redis.pipeline()
         else:
@@ -430,6 +437,7 @@ class Lookyloo():
                       'timestamp': har.initial_start_time,
                       'url': har.root_url,
                       'redirects': json.dumps(redirects),
+                      'categories': json.dumps(categories),
                       'capture_dir': str(capture_dir),
                       'incomplete_redirects': 1 if incomplete_redirects else 0}
         if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
@@ -473,6 +481,8 @@ class Lookyloo():
                 continue
             if 'timestamp' not in c:
                 continue
+            if 'categories' in c:
+                c['categories'] = json.loads(c['categories'])
             all_cache.append(c)
         return sorted(all_cache, key=operator.itemgetter('timestamp'), reverse=True)
 
@@ -487,6 +497,8 @@ class Lookyloo():
         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir']):
             cached['redirects'] = json.loads(cached['redirects'])  # type: ignore
             cached['capture_dir'] = Path(cached['capture_dir'])
+            if 'categories' in cached:
+                cached['categories'] = json.loads(cached['categories'])  # type: ignore
             return cached
         elif 'error' in cached:
             return cached
@@ -639,9 +651,9 @@ class Lookyloo():
         return self._get_raw(capture_uuid)
 
     def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
-                depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
-                referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
-                browser: Optional[str]=None) -> Union[bool, str]:
+            depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
+            referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
+            browser: Optional[str]=None) -> Union[bool, str]:
         url = url.strip()
         url = refang(url)
         if not url.startswith('http'):
diff --git a/website/web/__init__.py b/website/web/__init__.py
index aa3d8b3a..7cb412bb 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -368,16 +368,20 @@ def mark_as_legitimate(tree_uuid: str):
 # ##### helpers #####
 
 
-def index_generic(show_hidden: bool=False):
+def index_generic(show_hidden: bool=False, category: Optional[str]=None):
     titles = []
     if time_delta_on_index:
         # We want to filter the captures on the index
         cut_time = datetime.now() - timedelta(**time_delta_on_index)
     else:
         cut_time = None  # type: ignore
+
     for cached in lookyloo.sorted_cache:
         if not cached:
             continue
+        if category:
+            if 'categories' not in cached or category not in cached['categories']:
+                continue
         if show_hidden:
             if 'no_index' not in cached:
                 # Only display the hidden ones
@@ -386,6 +390,7 @@ def index_generic(show_hidden: bool=False):
                 continue
         if cut_time and datetime.fromisoformat(cached['timestamp'][:-1]) < cut_time:
             continue
+
         titles.append((cached['uuid'], cached['title'], cached['timestamp'], cached['url'],
                        cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
     titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
@@ -409,6 +414,11 @@ def index_hidden():
     return index_generic(show_hidden=True)
 
 
+@app.route('/category/<category>', methods=['GET'])
+def index_category(category: str):
+    return index_generic(category=category)
+
+
 @app.route('/cookies', methods=['GET'])
 def cookies_lookup():
     i = Indexing()
@@ -428,6 +438,13 @@ def ressources():
     return render_template('ressources.html', ressources=ressources)
 
 
+@app.route('/categories', methods=['GET'])
+def categories():
+    i = Indexing()
+    print(i.categories)
+    return render_template('categories.html', categories=i.categories)
+
+
 @app.route('/rebuild_all')
 @auth.login_required
 def rebuild_all():
diff --git a/website/web/templates/categories.html b/website/web/templates/categories.html
new file mode 100644
index 00000000..cd0c4f2f
--- /dev/null
+++ b/website/web/templates/categories.html
@@ -0,0 +1,46 @@
+{% extends "main.html" %}
+
+{% from 'bootstrap/utils.html' import render_messages %}
+
+{% block title %}Categories{% endblock %}
+
+{% block scripts %}
+{{ super() }}
+<script src='{{ url_for('static', filename='datatables.min.js') }}'></script>
+
+<script>$(document).ready(function () { $('#table').DataTable(); });</script>
+{% endblock %}
+
+{% block styles %}
+{{ super() }}
+<link rel="stylesheet" href="{{ url_for('static', filename='datatables.min.css') }}">
+{% endblock %}
+
+
+{% block content %}
+<div class="table-responsive">
+  <table id="table" class="table" style="width:96%">
+    <thead>
+      <tr>
+        <th>Category</th>
+        <th>Frequency</th>
+      </tr>
+    </thead>
+    <tbody>
+      {% for category, freq in categories %}
+      <tr>
+        <td>
+          <a href="{{ url_for('index_category', category=category) }}">
+            {{ category }}
+          </a>
+        </td>
+        <td>
+          {{ freq }}
+        </td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
+</div>
+
+{% endblock %}
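
Note (not part of the patch): a minimal sketch of the Redis layout the new Indexing
methods rely on, in case it helps review. It assumes a local Redis instance and uses
made-up capture UUIDs and category names; the key names mirror the ones used in
lookyloo/indexing.py above.

    #!/usr/bin/env python3
    from redis import Redis

    redis = Redis(host='localhost', port=6379, decode_responses=True)

    def index_categories(capture_uuid: str, categories: list) -> None:
        # Same bookkeeping as Indexing.index_categories_capture:
        # skip empty input and captures that were already indexed.
        if not categories or redis.sismember('indexed_categories', capture_uuid):
            return
        redis.sadd('indexed_categories', capture_uuid)
        pipeline = redis.pipeline()
        for category in categories:
            pipeline.zincrby('categories', 1, category)  # global frequency counter
            pipeline.sadd(category, capture_uuid)        # per-category set of capture UUIDs
        pipeline.execute()

    index_categories('11111111-1111-1111-1111-111111111111', ['phishing', 'parking-page'])

    # What the /categories page and the /category/<category> route read back,
    # via Indexing.categories and Indexing.get_captures_category:
    print(redis.zrevrange('categories', 0, 200, withscores=True))
    print(redis.smembers('phishing'))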