fix: Rebuild caches when tree doesn't exist

pull/78/head
Raphaël Vinot 2020-07-01 02:19:52 +02:00
parent 23419a31b9
commit 0c5501016c
2 changed files with 30 additions and 10 deletions

View File

@ -24,6 +24,15 @@ if __name__ == '__main__':
indexing = Indexing()
indexing.clear_indexes()
for capture_dir in lookyloo.capture_dirs:
    # Initialize before the try: if load_pickle_tree raises, `tree` would
    # otherwise be unbound and the `if tree:` below would raise NameError.
    tree = None
    try:
        tree = load_pickle_tree(capture_dir)
    except Exception as e:
        print(capture_dir, e)
    if tree:
        # A cached tree exists: just (re)build the indexes from it.
        indexing.index_cookies_capture(tree)
        indexing.index_body_hashes_capture(tree)
    else:
        # No usable cached tree: rebuild the pickle for this capture.
        # Best-effort — report and keep going so one broken capture
        # doesn't abort the whole rebuild.
        try:
            lookyloo.cache_tree(capture_dir=capture_dir)
        except Exception as e:
            print(capture_dir, e)

View File

@ -20,6 +20,7 @@ from urllib.parse import urlsplit
from uuid import uuid4
from zipfile import ZipFile
import publicsuffix2 # type: ignore
from defang import refang # type: ignore
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
from redis import Redis
@ -209,14 +210,24 @@ class Lookyloo():
with self_generated_ua_file.open('w') as f:
json.dump(to_store, f, indent=2)
def cache_tree(self, capture_uuid) -> None:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
def cache_tree(self, capture_uuid: Optional[str]=None, capture_dir: Optional[Union[str, Path]]=None) -> None:
c_dir = None
if capture_uuid:
c_dir = self.lookup_capture_dir(capture_uuid)
if not c_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
elif capture_dir:
if isinstance(capture_dir, str):
c_dir = Path(capture_dir)
else:
c_dir = capture_dir
with open((capture_dir / 'uuid'), 'r') as f:
if not c_dir:
raise Exception('Need either capture_uuid or capture_dir')
with open((c_dir / 'uuid'), 'r') as f:
uuid = f.read()
har_files = sorted(capture_dir.glob('*.har'))
har_files = sorted(c_dir.glob('*.har'))
try:
ct = CrawledTree(har_files, uuid)
self.indexing.index_cookies_capture(ct)
@ -224,7 +235,7 @@ class Lookyloo():
except Har2TreeError as e:
raise NoValidHarFile(e.message)
with (capture_dir / 'tree.pickle').open('wb') as _p:
with (c_dir / 'tree.pickle').open('wb') as _p:
pickle.dump(ct, _p)
def get_crawled_tree(self, capture_uuid: str) -> CrawledTree: