new: compress HAR files in archived captures.

pull/460/head
Raphaël Vinot 2022-07-12 18:44:33 +02:00
parent 2caa19aec0
commit 5f329e4d7b
2 changed files with 25 additions and 11 deletions
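The commit gzip-compresses the HAR files of archived captures and, while touching those code paths, switches several directory scans from glob('**/pattern') to the equivalent rglob(pattern). As a reading aid, here is a standalone sketch of the compression idiom the new _compress_hars() method applies (gzip.open plus shutil.copyfileobj, then unlink the original); the directory path is hypothetical and not part of the commit:

#!/usr/bin/env python3
import gzip
import shutil
from pathlib import Path


def compress_har(har: Path) -> None:
    # Stream the HAR into a gzipped sibling, then drop the uncompressed file.
    with har.open('rb') as f_in:
        with gzip.open(f'{har}.gz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    har.unlink()


if __name__ == '__main__':
    # Hypothetical location of archived captures.
    for har in Path('archived_captures').rglob('*.har'):
        compress_har(har)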


@@ -1,7 +1,10 @@
 #!/usr/bin/env python3
 
+import csv
+import gzip
 import logging
+import shutil
 
 from collections import defaultdict
 from collections.abc import Mapping
 from datetime import datetime, timedelta
@@ -34,6 +37,7 @@ class Archiver(AbstractManager):
         self._archive()
         self._update_all_capture_indexes()
         self._load_indexes()
+        self._compress_hars()
 
     def _update_index(self, root_dir: Path) -> None:
         current_index: Dict[str, str] = {}
@@ -72,12 +76,12 @@ class Archiver(AbstractManager):
     def _update_all_capture_indexes(self):
         '''Run that after the captures are in the proper directories'''
         # Recent captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)
 
         # Archived captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)
@@ -89,7 +93,7 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
-        for capture_uuid in get_captures_dir().glob('**/uuid'):
+        for capture_uuid in get_captures_dir().rglob('uuid'):
             timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 continue
@@ -107,16 +111,27 @@ class Archiver(AbstractManager):
                 dest_dir.mkdir(parents=True, exist_ok=True)
                 for capture_path in captures:
                     p.delete(str(capture_path))
+                    (capture_path / 'tree.pickle').unlink(missing_ok=True)
                     capture_path.rename(dest_dir / capture_path.name)
         p.execute()
         # Clear empty
         self.logger.info('Archiving done.')
 
+    def _compress_hars(self):
+        for index in self.archived_captures_dir.rglob('index'):
+            with index.open('r') as _f:
+                for uuid, dirname in csv.reader(_f):
+                    for har in (index.parent / dirname).glob('*.har'):
+                        if not har.exists():
+                            continue
+                        with har.open('rb') as f_in:
+                            with gzip.open(f'{har}.gz', 'wb') as f_out:
+                                shutil.copyfileobj(f_in, f_out)
+                        har.unlink()
+
     def _load_indexes(self):
         # Initialize archives
-        for index in get_captures_dir().glob('**/index'):
+        for index in get_captures_dir().rglob('index'):
             with index.open('r') as _f:
                 recent_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
             if recent_uuids:
@@ -125,7 +140,7 @@ class Archiver(AbstractManager):
                 index.unlink()
 
         # Initialize archives
-        for index in self.archived_captures_dir.glob('**/index'):
+        for index in self.archived_captures_dir.rglob('index'):
             with index.open('r') as _f:
                 archived_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
             if archived_uuids:

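The second changed file widens the HAR lookups from '*.har' to '*.har*' so the freshly gzipped files are still found when a capture is cached or its tree is rebuilt. This assumes the downstream HAR reader accepts gzipped input; the helper below is only a sketch of such transparent opening, not har2tree's actual implementation:

import gzip
import json
from pathlib import Path


def load_har(har: Path) -> dict:
    # Accept both 'something.har' and 'something.har.gz'.
    if har.suffix == '.gz':
        with gzip.open(har, 'rt', encoding='utf-8') as f:
            return json.load(f)
    with har.open(encoding='utf-8') as f:
        return json.load(f)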

@@ -200,8 +200,7 @@ class CapturesIndex(Mapping):
                 time.sleep(5)
             return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
 
-        har_files = sorted(capture_dir.glob('*.har'))
-        pickle_file = capture_dir / 'tree.pickle'
+        har_files = sorted(capture_dir.glob('*.har*'))
         try:
             tree = CrawledTree(har_files, uuid)
             self.__resolve_dns(tree)
@@ -212,7 +211,7 @@ class CapturesIndex(Mapping):
         except RecursionError as e:
             raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
         else:
-            with pickle_file.open('wb') as _p:
+            with (capture_dir / 'tree.pickle').open('wb') as _p:
                 # Some pickles require a pretty high recursion limit, this kindof fixes it.
                 # If the capture is really broken (generally a refresh to self), the capture
                 # is discarded in the RecursionError above.
@@ -247,7 +246,7 @@ class CapturesIndex(Mapping):
                     error_to_cache = content
                 cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
 
-        if (har_files := sorted(capture_dir.glob('*.har'))):
+        if (har_files := sorted(capture_dir.glob('*.har*'))):
             try:
                 har = HarFile(har_files[0], uuid)
                 cache['title'] = har.initial_title
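For reference, a self-contained demonstration of the two pathlib pattern changes above: '*.har*' matches both plain and gzipped HAR files, and rglob(pattern) walks the tree exactly like glob('**/pattern'). The directory name is hypothetical.

from pathlib import Path

capture_dir = Path('demo_capture')  # hypothetical capture directory
capture_dir.mkdir(exist_ok=True)
(capture_dir / '0.har').touch()
(capture_dir / '0.har.gz').touch()

# '*.har' only sees the uncompressed file; '*.har*' sees both.
print(sorted(p.name for p in capture_dir.glob('*.har')))   # ['0.har']
print(sorted(p.name for p in capture_dir.glob('*.har*')))  # ['0.har', '0.har.gz']

# rglob() is the recursive shorthand used throughout the archiver changes.
assert sorted(capture_dir.rglob('*.har')) == sorted(capture_dir.glob('**/*.har'))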