diff --git a/bin/rebuild_caches.py b/bin/rebuild_caches.py
index 060488ff..1e7338ce 100755
--- a/bin/rebuild_caches.py
+++ b/bin/rebuild_caches.py
@@ -24,6 +24,16 @@ if __name__ == '__main__':
     indexing = Indexing()
     indexing.clear_indexes()
     for capture_dir in lookyloo.capture_dirs:
-        tree = load_pickle_tree(capture_dir)
-        indexing.index_cookies_capture(tree)
-        indexing.index_body_hashes_capture(tree)
+        # Reset on every iteration: load_pickle_tree may raise, and a stale
+        # tree from the previous capture must not be re-indexed.
+        tree = None
+        try:
+            tree = load_pickle_tree(capture_dir)
+        except Exception as e:
+            print(capture_dir, e)
+        if tree:
+            indexing.index_cookies_capture(tree)
+            indexing.index_body_hashes_capture(tree)
+        else:
+            try:
+                lookyloo.cache_tree(capture_dir=capture_dir)
+            except Exception as e:
+                print(capture_dir, e)
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 4e0bb0da..9bb4bc6d 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -20,6 +20,7 @@ from urllib.parse import urlsplit
 from uuid import uuid4
 from zipfile import ZipFile
 
+import publicsuffix2  # type: ignore
 from defang import refang  # type: ignore
 from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
 from redis import Redis
@@ -209,14 +210,24 @@ class Lookyloo():
         with self_generated_ua_file.open('w') as f:
             json.dump(to_store, f, indent=2)
 
-    def cache_tree(self, capture_uuid) -> None:
-        capture_dir = self.lookup_capture_dir(capture_uuid)
-        if not capture_dir:
-            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+    def cache_tree(self, capture_uuid: Optional[str]=None, capture_dir: Optional[Union[str, Path]]=None) -> None:
+        c_dir = None
+        if capture_uuid:
+            c_dir = self.lookup_capture_dir(capture_uuid)
+            if not c_dir:
+                raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        elif capture_dir:
+            if isinstance(capture_dir, str):
+                c_dir = Path(capture_dir)
+            else:
+                c_dir = capture_dir
 
-        with open((capture_dir / 'uuid'), 'r') as f:
+        if not c_dir:
+            raise Exception('Need either capture_uuid or capture_dir')
+
+        with open((c_dir / 'uuid'), 'r') as f:
             uuid = f.read()
-        har_files = sorted(capture_dir.glob('*.har'))
+        har_files = sorted(c_dir.glob('*.har'))
         try:
             ct = CrawledTree(har_files, uuid)
             self.indexing.index_cookies_capture(ct)
@@ -224,7 +235,7 @@ class Lookyloo():
         except Har2TreeError as e:
             raise NoValidHarFile(e.message)
 
-        with (capture_dir / 'tree.pickle').open('wb') as _p:
+        with (c_dir / 'tree.pickle').open('wb') as _p:
             pickle.dump(ct, _p)
 
     def get_crawled_tree(self, capture_uuid: str) -> CrawledTree:
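
Note on the new call paths: cache_tree() now accepts either a capture UUID (the original behaviour, resolved through the Redis cache) or a capture directory (the route bin/rebuild_caches.py takes when a capture has no usable tree.pickle). A minimal sketch of both invocations, assuming a configured Lookyloo instance with at least one existing capture; the UUID literal is a hypothetical placeholder:

    from lookyloo.lookyloo import Lookyloo

    lookyloo = Lookyloo()

    # Original path: resolve the capture directory from a UUID known to the
    # Redis cache. Raises MissingUUID when the lookup fails.
    lookyloo.cache_tree(capture_uuid='0f3ae191-0c24-4aae-9467-2f65c0fb1b08')  # hypothetical UUID

    # New path, used by bin/rebuild_caches.py: point straight at a capture
    # directory. Both str and pathlib.Path are accepted (strings are coerced
    # with Path()); passing neither argument raises a plain Exception.
    lookyloo.cache_tree(capture_dir=next(iter(lookyloo.capture_dirs)))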