From 5f329e4d7bcf6efab00820ea79282acda3f33eff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Tue, 12 Jul 2022 18:44:33 +0200
Subject: [PATCH] new: compress HAR files in archived captures.

---
 bin/archiver.py          | 29 ++++++++++++++++++++++-------
 lookyloo/capturecache.py |  7 +++----
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/bin/archiver.py b/bin/archiver.py
index f80dcbe3..f9e53a79 100755
--- a/bin/archiver.py
+++ b/bin/archiver.py
@@ -1,7 +1,10 @@
 #!/usr/bin/env python3
 
 import csv
+import gzip
 import logging
+import shutil
+
 from collections import defaultdict
 from collections.abc import Mapping
 from datetime import datetime, timedelta
@@ -34,6 +37,7 @@ class Archiver(AbstractManager):
         self._archive()
         self._update_all_capture_indexes()
         self._load_indexes()
+        self._compress_hars()
 
     def _update_index(self, root_dir: Path) -> None:
         current_index: Dict[str, str] = {}
@@ -72,12 +76,12 @@ class Archiver(AbstractManager):
     def _update_all_capture_indexes(self):
         '''Run that after the captures are in the proper directories'''
         # Recent captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)
 
         # Archived captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)
 
@@ -89,7 +93,7 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
-        for capture_uuid in get_captures_dir().glob('**/uuid'):
+        for capture_uuid in get_captures_dir().rglob('uuid'):
             timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 continue
@@ -107,16 +111,27 @@ class Archiver(AbstractManager):
                 dest_dir.mkdir(parents=True, exist_ok=True)
                 for capture_path in captures:
                     p.delete(str(capture_path))
+                    (capture_path / 'tree.pickle').unlink(missing_ok=True)
                     capture_path.rename(dest_dir / capture_path.name)
             p.execute()
 
-        # Clear empty
         self.logger.info('Archiving done.')
 
+    def _compress_hars(self):
+        for index in self.archived_captures_dir.rglob('index'):
+            with index.open('r') as _f:
+                for uuid, dirname in csv.reader(_f):
+                    for har in (index.parent / dirname).glob('*.har'):
+                        if not har.exists():
+                            continue
+                        with har.open('rb') as f_in:
+                            with gzip.open(f'{har}.gz', 'wb') as f_out:
+                                shutil.copyfileobj(f_in, f_out)
+                        har.unlink()
+
     def _load_indexes(self):
         # Initialize archives
-        for index in get_captures_dir().glob('**/index'):
+        for index in get_captures_dir().rglob('index'):
             with index.open('r') as _f:
                 recent_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
             if recent_uuids:
@@ -125,7 +140,7 @@ class Archiver(AbstractManager):
                 index.unlink()
 
         # Initialize archives
-        for index in self.archived_captures_dir.glob('**/index'):
+        for index in self.archived_captures_dir.rglob('index'):
             with index.open('r') as _f:
                 archived_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
             if archived_uuids:
diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py
index f930816b..b59cecfd 100644
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -200,8 +200,7 @@ class CapturesIndex(Mapping):
                 time.sleep(5)
                 return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
 
-        har_files = sorted(capture_dir.glob('*.har'))
-        pickle_file = capture_dir / 'tree.pickle'
+        har_files = sorted(capture_dir.glob('*.har*'))
         try:
             tree = CrawledTree(har_files, uuid)
             self.__resolve_dns(tree)
@@ -212,7 +211,7 @@ class CapturesIndex(Mapping):
         except RecursionError as e:
             raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
         else:
-            with pickle_file.open('wb') as _p:
+            with (capture_dir / 'tree.pickle').open('wb') as _p:
                 # Some pickles require a pretty high recursion limit, this kindof fixes it.
                 # If the capture is really broken (generally a refresh to self), the capture
                 # is discarded in the RecursionError above.
@@ -247,7 +246,7 @@ class CapturesIndex(Mapping):
                 error_to_cache = content
             cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
 
-        if (har_files := sorted(capture_dir.glob('*.har'))):
+        if (har_files := sorted(capture_dir.glob('*.har*'))):
             try:
                 har = HarFile(har_files[0], uuid)
                 cache['title'] = har.initial_title
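
Reviewer note, not part of the patch: the sketch below reproduces the gzip step from _compress_hars() as a standalone script, handy for trying the behaviour on a copy of an archive directory before the archiver runs. The compress_hars_under() helper and the 'archived_captures' example path are illustrative names only, not Lookyloo API. The capturecache.py side of the patch simply widens the globs to '*.har*' so both plain and gzipped HAR files are picked up when a tree is rebuilt.

#!/usr/bin/env python3
# Standalone sketch (illustrative, not Lookyloo code): gzip-compress every
# *.har file under an archive directory, the same way _compress_hars() does.
import gzip
import shutil
from pathlib import Path


def compress_hars_under(archive_root: Path) -> int:
    '''Compress all *.har files below archive_root in place, return the count.'''
    compressed = 0
    for har in archive_root.rglob('*.har'):
        # Stream the original into a .gz file next to it, then drop the original,
        # mirroring gzip.open(f'{har}.gz', 'wb') + shutil.copyfileobj + unlink().
        with har.open('rb') as f_in, gzip.open(f'{har}.gz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        har.unlink()
        compressed += 1
    return compressed


if __name__ == '__main__':
    # Hypothetical path: point this at a *copy* of your archived captures.
    print(compress_hars_under(Path('archived_captures')))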