mirror of https://github.com/CIRCL/lookyloo
new: compress HAR files in archived captures.
parent 2caa19aec0
commit 5f329e4d7b
@@ -1,7 +1,10 @@
 #!/usr/bin/env python3
 
 import csv
+import gzip
 import logging
+import shutil
+
 from collections import defaultdict
 from collections.abc import Mapping
 from datetime import datetime, timedelta
@@ -34,6 +37,7 @@ class Archiver(AbstractManager):
         self._archive()
         self._update_all_capture_indexes()
         self._load_indexes()
+        self._compress_hars()
 
     def _update_index(self, root_dir: Path) -> None:
         current_index: Dict[str, str] = {}
@@ -72,12 +76,12 @@ class Archiver(AbstractManager):
     def _update_all_capture_indexes(self):
         '''Run that after the captures are in the proper directories'''
         # Recent captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)
 
         # Archived captures
-        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.glob('**/uuid')}
+        directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.rglob('uuid')}
         for directory_to_index in directories_to_index:
             self._update_index(directory_to_index)
 
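The change from glob('**/uuid') to rglob('uuid') here, and the matching changes in the hunks below, is behaviour-preserving: pathlib documents Path.rglob(pattern) as equivalent to Path.glob('**/' + pattern). A minimal check of that equivalence, with a hypothetical captures directory:

from pathlib import Path

captures = Path('captures')  # hypothetical directory tree of captures
# Both forms walk the tree recursively and match the same 'uuid' files.
assert set(captures.glob('**/uuid')) == set(captures.rglob('uuid'))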
@@ -89,7 +93,7 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
-        for capture_uuid in get_captures_dir().glob('**/uuid'):
+        for capture_uuid in get_captures_dir().rglob('uuid'):
             timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 continue
@@ -107,16 +111,27 @@ class Archiver(AbstractManager):
                 dest_dir.mkdir(parents=True, exist_ok=True)
                 for capture_path in captures:
                     p.delete(str(capture_path))
+                    (capture_path / 'tree.pickle').unlink(missing_ok=True)
                     capture_path.rename(dest_dir / capture_path.name)
         p.execute()
 
-        # Clear empty
-
         self.logger.info('Archiving done.')
 
+    def _compress_hars(self):
+        for index in self.archived_captures_dir.rglob('index'):
+            with index.open('r') as _f:
+                for uuid, dirname in csv.reader(_f):
+                    for har in (index.parent / dirname).glob('*.har'):
+                        if not har.exists():
+                            continue
+                        with har.open('rb') as f_in:
+                            with gzip.open(f'{har}.gz', 'wb') as f_out:
+                                shutil.copyfileobj(f_in, f_out)
+                        har.unlink()
+
     def _load_indexes(self):
         # Initialize archives
-        for index in get_captures_dir().glob('**/index'):
+        for index in get_captures_dir().rglob('index'):
             with index.open('r') as _f:
                 recent_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
             if recent_uuids:
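Once _compress_hars() has run, an archived capture directory holds <name>.har.gz files and the uncompressed originals are gone. Since a HAR file is plain JSON, reading one back only needs gzip and json from the standard library; a minimal sketch (the load_archived_har helper and the example path are illustrative, not part of Lookyloo):

import gzip
import json
from pathlib import Path

def load_archived_har(har_gz: Path) -> dict:
    # gzip.open in text mode returns a file object that json.load accepts directly.
    with gzip.open(har_gz, 'rt', encoding='utf-8') as f:
        return json.load(f)

# Hypothetical archived layout: archived_captures/<year>/<month>/<capture>/0.har.gz
# har = load_archived_har(Path('archived_captures/2020/12/2020-12-01T10:00:00.000000/0.har.gz'))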
@@ -125,7 +140,7 @@ class Archiver(AbstractManager):
             index.unlink()
 
         # Initialize archives
-        for index in self.archived_captures_dir.glob('**/index'):
+        for index in self.archived_captures_dir.rglob('index'):
             with index.open('r') as _f:
                 archived_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}
             if archived_uuids:
@@ -200,8 +200,7 @@ class CapturesIndex(Mapping):
                 time.sleep(5)
             return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
 
-        har_files = sorted(capture_dir.glob('*.har'))
-        pickle_file = capture_dir / 'tree.pickle'
+        har_files = sorted(capture_dir.glob('*.har*'))
         try:
             tree = CrawledTree(har_files, uuid)
             self.__resolve_dns(tree)
@@ -212,7 +211,7 @@ class CapturesIndex(Mapping):
         except RecursionError as e:
             raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
         else:
-            with pickle_file.open('wb') as _p:
+            with (capture_dir / 'tree.pickle').open('wb') as _p:
                 # Some pickles require a pretty high recursion limit, this kindof fixes it.
                 # If the capture is really broken (generally a refresh to self), the capture
                 # is discarded in the RecursionError above.
@@ -247,7 +246,7 @@ class CapturesIndex(Mapping):
             error_to_cache = content
             cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
 
-        if (har_files := sorted(capture_dir.glob('*.har'))):
+        if (har_files := sorted(capture_dir.glob('*.har*'))):
             try:
                 har = HarFile(har_files[0], uuid)
                 cache['title'] = har.initial_title
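The switch from glob('*.har') to glob('*.har*') in CapturesIndex lets the cache and tree builder pick up the gzip-compressed HARs produced by the archiver as well as the plain ones from recent captures. A quick illustration with a hypothetical archived capture directory; how the matched files are decompressed is a concern of the HAR parsing side and is not shown in this diff:

from pathlib import Path

capture_dir = Path('archived_captures/2020/12/2020-12-01T10:00:00.000000')  # hypothetical
# Old pattern: only uncompressed HARs match, so archived captures yield nothing.
old_matches = sorted(capture_dir.glob('*.har'))    # e.g. []
# New pattern: the .har.gz files written by _compress_hars() match too.
new_matches = sorted(capture_dir.glob('*.har*'))   # e.g. [0.har.gz, 1.har.gz]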