mirror of https://github.com/CIRCL/lookyloo
new: compress HAR files in archived captures.
parent 2caa19aec0
commit 5f329e4d7b

@@ -1,7 +1,10 @@
 #!/usr/bin/env python3

 import csv
+import gzip
 import logging
+import shutil

 from collections import defaultdict
 from collections.abc import Mapping
 from datetime import datetime, timedelta
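
The two new imports, gzip and shutil, exist for the compression step added further down: each HAR file is streamed through gzip.open() with shutil.copyfileobj() instead of being read into memory. A minimal standalone sketch of that pattern (gzip_file is a hypothetical helper, not part of lookyloo):

import gzip
import shutil
from pathlib import Path

def gzip_file(path: Path) -> Path:
    """Compress `path` to `path.gz` without loading the whole file into memory."""
    gz_path = path.with_suffix(path.suffix + '.gz')
    with path.open('rb') as f_in, gzip.open(gz_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)  # copies in chunks
    path.unlink()  # drop the uncompressed original, as the archiver does below
    return gz_path
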
@@ -34,6 +37,7 @@ class Archiver(AbstractManager):
         self._archive()
         self._update_all_capture_indexes()
         self._load_indexes()
+        self._compress_hars()

     def _update_index(self, root_dir: Path) -> None:
         current_index: Dict[str, str] = {}
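
The only change in the manager's periodic loop is the trailing call to self._compress_hars(), so compression always runs after captures have been archived and the indexes rebuilt and reloaded. A hedged illustration of that ordering (the class name and stub bodies below are placeholders, not lookyloo's real AbstractManager):

class ArchiverSketch:
    def _archive(self) -> None: ...
    def _update_all_capture_indexes(self) -> None: ...
    def _load_indexes(self) -> None: ...
    def _compress_hars(self) -> None: ...

    def _to_run_forever(self) -> None:
        # Same ordering as the patched method: compression is the last step,
        # touching only captures that already live in the archive directories.
        self._archive()
        self._update_all_capture_indexes()
        self._load_indexes()
        self._compress_hars()
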
@@ -72,12 +76,12 @@
     def _update_all_capture_indexes(self):
         '''Run that after the captures are in the proper directories'''
         # Recent captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)

         # Archived captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)
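
Path.glob('**/uuid') and Path.rglob('uuid') return the same matches; rglob() is simply the shorthand for a recursive pattern, so this is a readability cleanup rather than a behavioural change. A small sketch (the directory layout described in the comment is inferred from the .parent.parent in the diff):

from pathlib import Path

def directories_to_index(captures_root: Path) -> set[Path]:
    # Each capture directory contains a 'uuid' file; going two parents up
    # from that file yields the directory whose index needs refreshing.
    assert set(captures_root.glob('**/uuid')) == set(captures_root.rglob('uuid'))
    return {uuid_file.parent.parent for uuid_file in captures_root.rglob('uuid')}
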
@@ -89,7 +93,7 @@
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
-        for capture_uuid in get_captures_dir().glob('**/uuid'):
+        for capture_uuid in get_captures_dir().rglob('uuid'):
             timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 continue
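
The to_archive structure groups capture directories by year and month, taken from the timestamp encoded in each directory name, and skips anything newer than the cut-off date. A condensed sketch of that grouping (group_for_archive and the archive_after_days default are assumptions for illustration; the real cut_time is computed earlier in the method):

from collections import defaultdict
from datetime import date, datetime, timedelta
from pathlib import Path
from typing import Dict, List

def group_for_archive(capture_dirs: List[Path], archive_after_days: int = 180) -> Dict[int, Dict[int, List[Path]]]:
    cut_time = date.today() - timedelta(days=archive_after_days)
    to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
    for capture_dir in capture_dirs:
        # Directory names look like 2021-12-01T10:20:30.123456
        timestamp = datetime.strptime(capture_dir.name, '%Y-%m-%dT%H:%M:%S.%f')
        if timestamp.date() >= cut_time:
            continue  # too recent, stays in the live captures
        to_archive[timestamp.year][timestamp.month].append(capture_dir)
    return to_archive
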
@@ -107,16 +111,27 @@
                 dest_dir.mkdir(parents=True, exist_ok=True)
                 for capture_path in captures:
                     p.delete(str(capture_path))
                     (capture_path / 'tree.pickle').unlink(missing_ok=True)
                     capture_path.rename(dest_dir / capture_path.name)
         p.execute()

         # Clear empty

         self.logger.info('Archiving done.')

+    def _compress_hars(self):
+        for index in self.archived_captures_dir.rglob('index'):
+            with index.open('r') as _f:
+                for uuid, dirname in csv.reader(_f):
+                    for har in (index.parent / dirname).glob('*.har'):
+                        if not har.exists():
+                            continue
+                        with har.open('rb') as f_in:
+                            with gzip.open(f'{har}.gz', 'wb') as f_out:
+                                shutil.copyfileobj(f_in, f_out)
+                        har.unlink()
+
     def _load_indexes(self):
         # Initialize archives
-        for index in get_captures_dir().glob('**/index'):
+        for index in get_captures_dir().rglob('index'):
             with index.open('r') as _f:
                 recent_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
                 if recent_uuids:
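
This is the core of the commit: _compress_hars() walks every archive index, gzips each remaining *.har in place and deletes the original, so an archived capture ends up with 0.har.gz instead of 0.har. Because the inner glob is '*.har' rather than '*.har*', files that are already compressed are never matched again, so the step is safe to re-run on every pass. A self-contained round-trip check of the compression pattern (temporary paths and HAR content are made up):

import gzip
import shutil
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    har = Path(tmp) / '0.har'
    har.write_bytes(b'{"log": {"entries": []}}')
    original = har.read_bytes()

    # Same compression pattern as the new method.
    with har.open('rb') as f_in, gzip.open(f'{har}.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    har.unlink()

    # The gzipped file decompresses back to the original bytes.
    with gzip.open(f'{har}.gz', 'rb') as f:
        assert f.read() == original
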
@@ -125,7 +140,7 @@
                     index.unlink()

         # Initialize archives
-        for index in self.archived_captures_dir.glob('**/index'):
+        for index in self.archived_captures_dir.rglob('index'):
             with index.open('r') as _f:
                 archived_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
                 if archived_uuids:
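
Each index file is a plain two-column CSV of (uuid, dirname) rows, and loading it filters out entries whose capture directory has disappeared; the hunk cuts off before showing what is done with the resulting mapping. A sketch of that consumption step (load_index is a hypothetical helper):

import csv
from pathlib import Path
from typing import Dict

def load_index(index: Path) -> Dict[str, str]:
    with index.open('r') as _f:
        return {uuid: str(index.parent / dirname)
                for uuid, dirname in csv.reader(_f)
                if (index.parent / dirname).exists()}
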
@@ -200,8 +200,7 @@ class CapturesIndex(Mapping):
                 time.sleep(5)
             return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)

-        har_files = sorted(capture_dir.glob('*.har'))
-        pickle_file = capture_dir / 'tree.pickle'
+        har_files = sorted(capture_dir.glob('*.har*'))
         try:
             tree = CrawledTree(har_files, uuid)
             self.__resolve_dns(tree)
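
On the reading side, CapturesIndex now globs '*.har*' so that tree building still finds the HAR files of archived captures after they have been renamed to *.har.gz. Whatever consumes those paths then has to open gzipped and plain HARs alike; a hedged sketch of that idea (load_har is an illustration, not har2tree's actual loader):

import gzip
import json
from pathlib import Path
from typing import Any

def load_har(har: Path) -> Any:
    if har.suffix == '.gz':
        with gzip.open(har, 'rt', encoding='utf-8') as f:
            return json.load(f)
    with har.open(encoding='utf-8') as f:
        return json.load(f)
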
@@ -212,7 +211,7 @@
         except RecursionError as e:
             raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
         else:
-            with pickle_file.open('wb') as _p:
+            with (capture_dir / 'tree.pickle').open('wb') as _p:
                 # Some pickles require a pretty high recursion limit, this kindof fixes it.
                 # If the capture is really broken (generally a refresh to self), the capture
                 # is discarded in the RecursionError above.
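
The pickle is now written to capture_dir / 'tree.pickle' directly, since the pickle_file variable was removed above. The surrounding comments mention that some trees only pickle under a raised recursion limit; a sketch of that idea (the helper name, the limit value and the restore-in-finally pattern are assumptions, not the exact lookyloo code):

import pickle
import sys
from pathlib import Path
from typing import Any

def dump_tree(tree: Any, capture_dir: Path, limit: int = 50_000) -> None:
    previous = sys.getrecursionlimit()
    sys.setrecursionlimit(limit)  # deep redirect chains produce deep trees
    try:
        with (capture_dir / 'tree.pickle').open('wb') as _p:
            pickle.dump(tree, _p)
    finally:
        sys.setrecursionlimit(previous)
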
@@ -247,7 +246,7 @@
                 error_to_cache = content
             cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'

-        if (har_files := sorted(capture_dir.glob('*.har'))):
+        if (har_files := sorted(capture_dir.glob('*.har*'))):
             try:
                 har = HarFile(har_files[0], uuid)
                 cache['title'] = har.initial_title
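
The same '*.har' to '*.har*' widening is applied where the cache metadata is built from the first HAR. sorted() keeps the expected ordering either way: a name like '0.har' or '0.har.gz' still sorts before '1.har' or '1.har.gz', so har_files[0] keeps referring to the same first HAR (file names below are hypothetical):

# Lexicographic sorting is unaffected by the added .gz suffix.
assert sorted(['1.har.gz', '0.har.gz'])[0] == '0.har.gz'
assert sorted(['1.har', '0.har', '0.har.gz'])[0] == '0.har'
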