new: compress HAR files in archived captures.

pull/460/head
Raphaël Vinot 2022-07-12 18:44:33 +02:00
parent 2caa19aec0
commit 5f329e4d7b
2 changed files with 25 additions and 11 deletions
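The heart of the change is the new _compress_hars() method shown in the first file below: every HAR already moved into the archive is gzip-compressed on disk and the uncompressed original is removed. As a standalone illustration of that pattern (the helper name and path handling here are mine, not part of the commit), compressing a single file looks roughly like this:

import gzip
import shutil
from pathlib import Path

def gzip_in_place(path: Path) -> Path:
    """Compress a file to '<name>.gz' next to it and drop the original."""
    gz_path = Path(f'{path}.gz')
    with path.open('rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)  # stream in chunks, no full read into memory
    path.unlink()
    return gz_path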

View File

@@ -1,7 +1,10 @@
 #!/usr/bin/env python3
 import csv
+import gzip
 import logging
+import shutil
 from collections import defaultdict
 from collections.abc import Mapping
 from datetime import datetime, timedelta

@@ -34,6 +37,7 @@ class Archiver(AbstractManager):
         self._archive()
         self._update_all_capture_indexes()
         self._load_indexes()
+        self._compress_hars()

     def _update_index(self, root_dir: Path) -> None:
         current_index: Dict[str, str] = {}

@@ -72,12 +76,12 @@ class Archiver(AbstractManager):
     def _update_all_capture_indexes(self):
         '''Run that after the captures are in the proper directories'''
         # Recent captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)
         # Archived captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)

@@ -89,7 +93,7 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
-        for capture_uuid in get_captures_dir().glob('**/uuid'):
+        for capture_uuid in get_captures_dir().rglob('uuid'):
             timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 continue

@@ -107,16 +111,27 @@ class Archiver(AbstractManager):
                 dest_dir.mkdir(parents=True, exist_ok=True)
                 for capture_path in captures:
                     p.delete(str(capture_path))
+                    (capture_path / 'tree.pickle').unlink(missing_ok=True)
                     capture_path.rename(dest_dir / capture_path.name)
         p.execute()
-        # Clear empty
         self.logger.info('Archiving done.')

+    def _compress_hars(self):
+        for index in self.archived_captures_dir.rglob('index'):
+            with index.open('r') as _f:
+                for uuid, dirname in csv.reader(_f):
+                    for har in (index.parent / dirname).glob('*.har'):
+                        if not har.exists():
+                            continue
+                        with har.open('rb') as f_in:
+                            with gzip.open(f'{har}.gz', 'wb') as f_out:
+                                shutil.copyfileobj(f_in, f_out)
+                        har.unlink()
+
     def _load_indexes(self):
         # Initialize archives
-        for index in get_captures_dir().glob('**/index'):
+        for index in get_captures_dir().rglob('index'):
             with index.open('r') as _f:
                 recent_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
             if recent_uuids:

@@ -125,7 +140,7 @@ class Archiver(AbstractManager):
                 index.unlink()

         # Initialize archives
-        for index in self.archived_captures_dir.glob('**/index'):
+        for index in self.archived_captures_dir.rglob('index'):
             with index.open('r') as _f:
                 archived_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
             if archived_uuids:

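Because _compress_hars() deletes the original .har files once they are gzipped, anything that later opens an archived HAR has to cope with both plain and compressed files; that is what the '*.har' → '*.har*' glob changes in the second file are about. Whether the HAR parser accepts gzipped input directly is not visible in this diff; as a hedged sketch, a reader that handles both forms could look like this (the helper is hypothetical, not part of the commit):

import gzip
import json
from pathlib import Path

def load_har(har_path: Path) -> dict:
    """Load a HAR file that may or may not be gzip-compressed."""
    if har_path.suffix == '.gz':
        with gzip.open(har_path, 'rt', encoding='utf-8') as f:  # transparent decompression
            return json.load(f)
    with har_path.open('r', encoding='utf-8') as f:
        return json.load(f)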

View File

@@ -200,8 +200,7 @@ class CapturesIndex(Mapping):
             time.sleep(5)
             return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)

-        har_files = sorted(capture_dir.glob('*.har'))
-        pickle_file = capture_dir / 'tree.pickle'
+        har_files = sorted(capture_dir.glob('*.har*'))
         try:
             tree = CrawledTree(har_files, uuid)
             self.__resolve_dns(tree)

@@ -212,7 +211,7 @@ class CapturesIndex(Mapping):
         except RecursionError as e:
             raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
         else:
-            with pickle_file.open('wb') as _p:
+            with (capture_dir / 'tree.pickle').open('wb') as _p:
                 # Some pickles require a pretty high recursion limit, this kindof fixes it.
                 # If the capture is really broken (generally a refresh to self), the capture
                 # is discarded in the RecursionError above.

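The comments kept in this hunk refer to code just below the visible context: the comment implies the pickle dump is done with a temporarily raised recursion limit so that deeply nested trees can still be serialized. Those lines are not part of this diff, so the following is only a sketch of that general pattern, with names of my own choosing:

import pickle
import sys

def dump_with_higher_recursion_limit(obj, fileobj, factor: float = 1.1) -> None:
    """Pickle an object whose nesting depth is close to the interpreter's recursion limit."""
    default_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(int(default_limit * factor))  # temporarily allow deeper recursion
    try:
        pickle.dump(obj, fileobj, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        sys.setrecursionlimit(default_limit)  # always restore the default limit
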
@@ -247,7 +246,7 @@ class CapturesIndex(Mapping):
             error_to_cache = content
             cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'

-        if (har_files := sorted(capture_dir.glob('*.har'))):
+        if (har_files := sorted(capture_dir.glob('*.har*'))):
             try:
                 har = HarFile(har_files[0], uuid)
                 cache['title'] = har.initial_title
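
Finally, the widened '*.har*' pattern in the cache code above matches the same HAR files whether or not the archiver has compressed them yet. With hypothetical capture contents (the file names are only illustrative):

from pathlib import Path

capture_dir = Path('/tmp/archive/2022/06/some-capture')  # hypothetical archived capture
har_files = sorted(capture_dir.glob('*.har*'))
# Before compression the list might be ['0.har', '1.har'];
# after _compress_hars() it becomes ['0.har.gz', '1.har.gz'].
# sorted() keeps the ordering stable, so har_files[0] still refers to the first HAR of the capture.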