diff --git a/bin/rebuild_caches.py b/bin/rebuild_caches.py index 83ccafb5..6ec6fc51 100755 --- a/bin/rebuild_caches.py +++ b/bin/rebuild_caches.py @@ -23,11 +23,19 @@ if __name__ == '__main__': indexing = Indexing() indexing.clear_indexes() for capture_uuid in lookyloo.capture_uuids: + index = True try: tree = lookyloo.get_crawled_tree(capture_uuid) except Exception as e: print(capture_uuid, e) continue + + if lookyloo.is_public_instance: + cache = lookyloo.capture_cache(capture_uuid) + if cache.get('no_index') is not None: + index = False + # NOTE: these two methods do nothing if we just generated the pickle - indexing.index_cookies_capture(tree) - indexing.index_body_hashes_capture(tree) + if index: + indexing.index_cookies_capture(tree) + indexing.index_body_hashes_capture(tree) diff --git a/config/generic.json.sample b/config/generic.json.sample index 065864bd..8d5eebfb 100644 --- a/config/generic.json.sample +++ b/config/generic.json.sample @@ -2,6 +2,7 @@ "loglevel": "INFO", "splash_loglevel": "WARNING", "only_global_lookups": true, + "public_instance": false, "splash_url": "http://127.0.0.1:8050", "default_user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36", "cache_clean_user": {}, @@ -24,6 +25,7 @@ "_notes": { "loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels", "splash_loglevel": "(Splash) INFO is *very* verbose.", + "public_instance": "true means disabling features deemed unsafe on a public instance (such as indexing private captures)", "only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network", "splash_url": "URL to connect to splash", "default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA", diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 9eccaa05..b598ad87 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -154,6 +154,7 @@ class Lookyloo(): self.configs: Dict[str, Dict[str, Any]] = load_configs() self.logger.setLevel(self.get_config('loglevel')) self.indexing = Indexing() + self.is_public_instance = self.get_config('public_instance') self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True) self.scrape_dir: Path = get_homedir() / 'scraped' @@ -219,6 +220,7 @@ class Lookyloo(): json.dump(to_store, f, indent=2) def cache_tree(self, capture_uuid: str) -> None: + '''Generate the pickle, add capture in the indexes''' capture_dir = self.lookup_capture_dir(capture_uuid) if not capture_dir: raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache') @@ -226,10 +228,17 @@ class Lookyloo(): with open((capture_dir / 'uuid'), 'r') as f: uuid = f.read() har_files = sorted(capture_dir.glob('*.har')) + # NOTE: We only index the public captures + index = True try: ct = CrawledTree(har_files, uuid) - self.indexing.index_cookies_capture(ct) - self.indexing.index_body_hashes_capture(ct) + if self.is_public_instance: + cache = self.capture_cache(capture_uuid) + if cache.get('no_index') is not None: + index = False + if index: + self.indexing.index_cookies_capture(ct) + self.indexing.index_body_hashes_capture(ct) except Har2TreeError as e: raise NoValidHarFile(e.message)