mirror of https://github.com/CIRCL/lookyloo
				
				
				
			new: Integrate categorization in indexing
							parent
							
								
									9243f84295
								
							
						
					
					
						commit
						354f269218
					
				|  | @ -9,7 +9,3 @@ updates: | |||
|     directory: "/" | ||||
|     schedule: | ||||
|       interval: "daily" | ||||
|   - package-ecosystem: "pip" | ||||
|     directory: "/client/" | ||||
|     schedule: | ||||
|       interval: "daily" | ||||
|  |  | |||
|  | @ -41,6 +41,8 @@ def main(): | |||
|             indexing.index_cookies_capture(tree) | ||||
|             indexing.index_body_hashes_capture(tree) | ||||
|             indexing.index_url_capture(tree) | ||||
|             categories = list(lookyloo.categories_capture(capture_uuid).keys()) | ||||
|             indexing.index_categories_capture(capture_uuid, categories) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|  |  | |||
|  | @ -3,7 +3,7 @@ | |||
| 
 | ||||
| import hashlib | ||||
| from urllib.parse import urlsplit | ||||
| from typing import List, Tuple, Set, Dict, Optional | ||||
| from typing import List, Tuple, Set, Dict, Optional, Iterable | ||||
| from collections import defaultdict | ||||
| 
 | ||||
| from redis import Redis | ||||
|  | @ -192,3 +192,29 @@ class Indexing(): | |||
| 
 | ||||
|     def get_captures_hostname(self, hostname: str) -> Set[str]: | ||||
|         return self.redis.smembers(f'hostnames|{hostname}|captures')  # type: ignore | ||||
| 
 | ||||
|     # ###### Categories ###### | ||||
| 
 | ||||
|     @property | ||||
|     def categories(self) -> List[Tuple[str, int]]: | ||||
|         return [(c, int(score)) | ||||
|                 for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)] | ||||
| 
 | ||||
|     def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]): | ||||
|         if not categories: | ||||
|             return | ||||
|         print(capture_uuid, categories) | ||||
|         if self.redis.sismember('indexed_categories', capture_uuid): | ||||
|             # do not reindex | ||||
|             return | ||||
|         self.redis.sadd('indexed_categories', capture_uuid) | ||||
|         if not categories: | ||||
|             return | ||||
|         pipeline = self.redis.pipeline() | ||||
|         for category in categories: | ||||
|             pipeline.zincrby('categories', 1, category) | ||||
|             pipeline.sadd(category, capture_uuid) | ||||
|         pipeline.execute() | ||||
| 
 | ||||
|     def get_captures_category(self, category: str) -> Set[str]: | ||||
|         return self.redis.smembers(category)  # type: ignore | ||||
|  |  | |||
|  | @ -133,6 +133,8 @@ class Lookyloo(): | |||
|                 self.indexing.index_cookies_capture(ct) | ||||
|                 self.indexing.index_body_hashes_capture(ct) | ||||
|                 self.indexing.index_url_capture(ct) | ||||
|                 categories = list(self.categories_capture(capture_uuid).keys()) | ||||
|                 self.indexing.index_categories_capture(capture_uuid, categories) | ||||
|         except Har2TreeError as e: | ||||
|             raise NoValidHarFile(e.message) | ||||
| 
 | ||||
|  | @ -272,7 +274,7 @@ class Lookyloo(): | |||
|             return {} | ||||
|         return ct.root_hartree.stats | ||||
| 
 | ||||
|     def categories_capture(self, capture_uuid: str): | ||||
|     def categories_capture(self, capture_uuid: str) -> Dict[str, Any]: | ||||
|         capture_dir = self.lookup_capture_dir(capture_uuid) | ||||
|         if not capture_dir: | ||||
|             raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache') | ||||
|  | @ -280,11 +282,10 @@ class Lookyloo(): | |||
|         if (capture_dir / 'categories').exists(): | ||||
|             with (capture_dir / 'categories').open() as f: | ||||
|                 current_categories = [line.strip() for line in f.readlines()] | ||||
|         else: | ||||
|             current_categories = [] | ||||
|         return {e: self.taxonomies.revert_machinetag(e) for e in current_categories} | ||||
|             return {e: self.taxonomies.revert_machinetag(e) for e in current_categories} | ||||
|         return {} | ||||
| 
 | ||||
|     def categorize_capture(self, capture_uuid: str, category: str): | ||||
|     def categorize_capture(self, capture_uuid: str, category: str) -> None: | ||||
|         if not get_config('generic', 'enable_categorization'): | ||||
|             return | ||||
|         # Make sure the category is mappable to a taxonomy. | ||||
|  | @ -303,7 +304,7 @@ class Lookyloo(): | |||
|         with (capture_dir / 'categories').open('w') as f: | ||||
|             f.writelines(f'{t}\n' for t in current_categories) | ||||
| 
 | ||||
|     def uncategorize_capture(self, capture_uuid: str, category: str): | ||||
|     def uncategorize_capture(self, capture_uuid: str, category: str) -> None: | ||||
|         if not get_config('generic', 'enable_categorization'): | ||||
|             return | ||||
|         capture_dir = self.lookup_capture_dir(capture_uuid) | ||||
|  | @ -382,7 +383,7 @@ class Lookyloo(): | |||
|         error_cache: Dict[str, str] = {} | ||||
|         if (capture_dir / 'error.txt').exists(): | ||||
|             # Something went wrong | ||||
|             with (Path(capture_dir) / 'error.txt').open() as _error: | ||||
|             with (capture_dir / 'error.txt').open() as _error: | ||||
|                 content = _error.read() | ||||
|                 try: | ||||
|                     error_to_cache = json.loads(content) | ||||
|  | @ -404,6 +405,12 @@ class Lookyloo(): | |||
|             error_cache['error'] = f'No har files in {capture_dir.name}' | ||||
|             fatal_error = True | ||||
| 
 | ||||
|         if (capture_dir / 'categories').exists(): | ||||
|             with (capture_dir / 'categories').open() as _categories: | ||||
|                 categories = [c.strip() for c in _categories.readlines()] | ||||
|         else: | ||||
|             categories = [] | ||||
| 
 | ||||
|         if not redis_pipeline: | ||||
|             p = self.redis.pipeline() | ||||
|         else: | ||||
|  | @ -430,6 +437,7 @@ class Lookyloo(): | |||
|                                                  'timestamp': har.initial_start_time, | ||||
|                                                  'url': har.root_url, | ||||
|                                                  'redirects': json.dumps(redirects), | ||||
|                                                  'categories': json.dumps(categories), | ||||
|                                                  'capture_dir': str(capture_dir), | ||||
|                                                  'incomplete_redirects': 1 if incomplete_redirects else 0} | ||||
|             if (capture_dir / 'no_index').exists():  # If the folders claims anonymity | ||||
|  | @ -473,6 +481,8 @@ class Lookyloo(): | |||
|                 continue | ||||
|             if 'timestamp' not in c: | ||||
|                 continue | ||||
|             if 'categories' in c: | ||||
|                 c['categories'] = json.loads(c['categories']) | ||||
|             all_cache.append(c) | ||||
|         return sorted(all_cache, key=operator.itemgetter('timestamp'), reverse=True) | ||||
| 
 | ||||
|  | @ -487,6 +497,8 @@ class Lookyloo(): | |||
|         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir']): | ||||
|             cached['redirects'] = json.loads(cached['redirects'])  # type: ignore | ||||
|             cached['capture_dir'] = Path(cached['capture_dir']) | ||||
|             if 'categories' in cached: | ||||
|                 cached['categories'] = json.loads(cached['categories'])  # type: ignore | ||||
|             return cached | ||||
|         elif 'error' in cached: | ||||
|             return cached | ||||
|  | @ -639,9 +651,9 @@ class Lookyloo(): | |||
|         return self._get_raw(capture_uuid) | ||||
| 
 | ||||
|     def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None, | ||||
|                depth: int=1, listing: bool=True, user_agent: Optional[str]=None, | ||||
|                referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None, | ||||
|                browser: Optional[str]=None) -> Union[bool, str]: | ||||
|                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None, | ||||
|                 referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None, | ||||
|                 browser: Optional[str]=None) -> Union[bool, str]: | ||||
|         url = url.strip() | ||||
|         url = refang(url) | ||||
|         if not url.startswith('http'): | ||||
|  |  | |||
|  | @ -368,16 +368,20 @@ def mark_as_legitimate(tree_uuid: str): | |||
| 
 | ||||
| # ##### helpers ##### | ||||
| 
 | ||||
| def index_generic(show_hidden: bool=False): | ||||
| def index_generic(show_hidden: bool=False, category: Optional[str]=None): | ||||
|     titles = [] | ||||
|     if time_delta_on_index: | ||||
|         # We want to filter the captures on the index | ||||
|         cut_time = datetime.now() - timedelta(**time_delta_on_index) | ||||
|     else: | ||||
|         cut_time = None  # type: ignore | ||||
| 
 | ||||
|     for cached in lookyloo.sorted_cache: | ||||
|         if not cached: | ||||
|             continue | ||||
|         if category: | ||||
|             if 'categories' not in cached or category not in cached['categories']: | ||||
|                 continue | ||||
|         if show_hidden: | ||||
|             if 'no_index' not in cached: | ||||
|                 # Only display the hidden ones | ||||
|  | @ -386,6 +390,7 @@ def index_generic(show_hidden: bool=False): | |||
|             continue | ||||
|         if cut_time and datetime.fromisoformat(cached['timestamp'][:-1]) < cut_time: | ||||
|             continue | ||||
| 
 | ||||
|         titles.append((cached['uuid'], cached['title'], cached['timestamp'], cached['url'], | ||||
|                        cached['redirects'], True if cached['incomplete_redirects'] == '1' else False)) | ||||
|     titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True) | ||||
|  | @ -409,6 +414,11 @@ def index_hidden(): | |||
|     return index_generic(show_hidden=True) | ||||
| 
 | ||||
| 
 | ||||
@app.route('/category/<string:category>', methods=['GET'])
def index_category(category: str):
    """Serve the index page filtered on the category taken from the URL."""
    filtered_index = index_generic(category=category)
    return filtered_index
| 
 | ||||
| 
 | ||||
| @app.route('/cookies', methods=['GET']) | ||||
| def cookies_lookup(): | ||||
|     i = Indexing() | ||||
|  | @ -428,6 +438,13 @@ def ressources(): | |||
|     return render_template('ressources.html', ressources=ressources) | ||||
| 
 | ||||
| 
 | ||||
@app.route('/categories', methods=['GET'])
def categories():
    """Render ``categories.html`` with the value of ``Indexing().categories``."""
    # Read the property a single time and drop the leftover debug print,
    # which evaluated it a second time on every request.
    indexed_categories = Indexing().categories
    return render_template('categories.html', categories=indexed_categories)
| 
 | ||||
| 
 | ||||
| @app.route('/rebuild_all') | ||||
| @auth.login_required | ||||
| def rebuild_all(): | ||||
|  |  | |||
|  | @ -0,0 +1,46 @@ | |||
| {% extends "main.html" %} | ||||
| 
 | ||||
| {% from 'bootstrap/utils.html' import render_messages %} | ||||
| 
 | ||||
| {% block title %}Categories{% endblock %} | ||||
| 
 | ||||
| {% block scripts %} | ||||
| {{ super() }} | ||||
| <script src='{{ url_for('static', filename='datatables.min.js') }}'></script> | ||||
| <script type="text/javascript"> | ||||
|     $('#table').DataTable( { | ||||
|         "order": [[ 1, "desc" ]], | ||||
|         "pageLength": 500 | ||||
|     }); | ||||
| </script> | ||||
| 
 | ||||
| {% endblock %} | ||||
| 
 | ||||
| {% block styles %} | ||||
| {{ super() }} | ||||
| <link rel="stylesheet" href="{{ url_for('static', filename='datatables.min.css') }}"> | ||||
| {% endblock %} | ||||
| 
 | ||||
| 
 | ||||
| {% block content %} | ||||
|   <div class="table-responsive"> | ||||
|   <table id="table" class="table" style="width:96%"> | ||||
|     <thead> | ||||
|      <tr> | ||||
|        <th>Category</th> | ||||
|        <th>Frequency</th> | ||||
|      </tr> | ||||
|     </thead> | ||||
|     <tbody> | ||||
|       {% for category, freq in categories %} | ||||
|       <tr> | ||||
|         <td> | ||||
|           <a href="{{ url_for('index_category', category=category) }}">{{ category }}</a> | ||||
|         </td> | ||||
|         <td>{{ freq }}</td> | ||||
|       </tr> | ||||
|       {% endfor %} | ||||
|     </tbody> | ||||
|   </table> | ||||
|   </div> | ||||
| {% endblock %} | ||||
		Loading…
	
		Reference in New Issue
	
	 Raphaël Vinot
						Raphaël Vinot