lookyloo/bin/archiver.py

175 lines
7.0 KiB
Python
Raw Normal View History

2021-08-20 17:46:22 +02:00
#!/usr/bin/env python3
import csv
import gzip
2021-08-20 17:46:22 +02:00
import logging
2022-11-23 15:54:22 +01:00
import logging.config
import shutil
2021-09-07 12:59:31 +02:00
from collections import defaultdict
2022-05-23 00:15:52 +02:00
from collections.abc import Mapping
2021-09-07 12:59:31 +02:00
from datetime import datetime, timedelta
2021-08-20 17:46:22 +02:00
from pathlib import Path
2021-09-07 12:59:31 +02:00
from typing import Dict, List
2021-08-20 17:46:22 +02:00
from redis import Redis
2021-10-18 13:06:43 +02:00
from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path
from lookyloo.helpers import get_captures_dir
2021-08-20 17:46:22 +02:00
2022-11-23 15:54:22 +01:00
# Configure the logging module from the dictionary stored under the
# 'logging' configuration entry, before any manager is instantiated.
logging.config.dictConfig(get_config('logging'))
2021-08-20 17:46:22 +02:00
class Archiver(AbstractManager):
    '''Manager that archives old captures and keeps the capture indexes up-to-date.

    On every run (see `_to_run_forever`) it:
      1. moves the captures older than the configured interval in the archive directory,
      2. rebuilds the `index` file (CSV: uuid, directory name) of every directory
         containing captures,
      3. loads these indexes in the redis hashes `lookup_dirs` and `lookup_dirs_archived`,
      4. gzips the HAR files of the archived captures.
    '''

    def __init__(self, loglevel: int=logging.INFO):
        '''Connect to the cache redis, make sure the archive directory exists, load the indexes.'''
        super().__init__(loglevel)
        self.script_name = 'archiver'
        self.redis = Redis(unix_socket_path=get_socket_path('cache'))

        # make sure archived captures dir exists
        self.archived_captures_dir = get_homedir() / 'archived_captures'
        self.archived_captures_dir.mkdir(parents=True, exist_ok=True)

        self._load_indexes()

    def _to_run_forever(self):
        '''One archiving pass. The order matters: the indexes are only rebuilt and
        loaded once the captures are in their final directories.'''
        self._archive()
        self._update_all_capture_indexes()
        self._load_indexes()
        self._compress_hars()

    def _update_index(self, root_dir: Path) -> None:
        '''(Re)build the `index` file of `root_dir`.

        The index is a CSV file with one `uuid,directory name` row for each capture
        directory directly below `root_dir`. Entries pointing to a directory that
        doesn't exist anymore are dropped, captures missing from the index are added
        by reading their `uuid` file. If no capture is left, `root_dir` is moved to
        `discarded_captures` in the home directory and no index is written.
        '''
        current_index: Dict[str, str] = {}

        index_file = root_dir / 'index'
        if index_file.exists():
            # Skip index entries if the directory has been archived.
            # NOTE: iterdir() returns a one-shot generator, and a membership test
            # consumes it. It must be materialized or valid entries are dropped.
            existing_captures = set(root_dir.iterdir())
            try:
                with index_file.open('r', newline='') as _f:
                    current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)
                                     if (root_dir / dirname) in existing_captures}
            except Exception as e:
                # the index file is broken, it will be recreated.
                self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}')
            if not current_index:
                index_file.unlink()

        # Directory names are unique below root_dir, so this set doesn't need to be
        # updated while we add the missing entries.
        indexed_dirnames = set(current_index.values())
        for uuid_file in root_dir.glob('*/uuid'):
            if uuid_file.parent.name in indexed_dirnames:
                # The path is already in the index file, no need to read the uuid file
                continue
            with uuid_file.open() as _f:
                current_index[_f.read().strip()] = uuid_file.parent.name

        if not current_index:
            # The directory has been archived. It is probably safe to unlink, but
            # if it's not, we will lose a whole bunch of captures. Moving instead for safety.
            discarded_dir = get_homedir() / 'discarded_captures'
            # rename() fails if the parent of the destination doesn't exist.
            discarded_dir.mkdir(parents=True, exist_ok=True)
            root_dir.rename(discarded_dir / root_dir.name)
            return

        # newline='' is the mode the csv module expects for the files it writes.
        with index_file.open('w', newline='') as _f:
            index_writer = csv.writer(_f)
            for uuid, dirname in current_index.items():
                index_writer.writerow([uuid, dirname])

    def _update_indexes_below(self, captures_root: Path) -> None:
        '''Update the index of every directory below `captures_root` that contains captures.'''
        # Layout: <directory to index>/<capture directory>/uuid
        directories_to_index = {uuid_file.parent.parent for uuid_file in captures_root.rglob('uuid')}
        for directory_to_index in directories_to_index:
            self.logger.debug(f'Updating index for {directory_to_index}')
            self._update_index(directory_to_index)

    def _update_all_capture_indexes(self):
        '''Run that after the captures are in the proper directories'''
        # Recent captures
        self.logger.info('Update recent indexes')
        self._update_indexes_below(get_captures_dir())
        self.logger.info('Recent indexes updated')

        # Archived captures
        self.logger.info('Update archives indexes')
        self._update_indexes_below(self.archived_captures_dir)
        self.logger.info('Archived indexes updated')

    def _archive(self):
        '''Move the old captures to `archived_captures/<year>/<month>/`.

        The cut-off is the first day of the month of (now - `archive` days, from the
        generic config): whole months are archived at once. The age of a capture is
        read from the name of its directory, directories with a name that isn't a
        timestamp are skipped.
        '''
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = (datetime.now() - archive_interval).date()
        cut_time = cut_time.replace(day=1)

        # Format:
        # { 2020: { 12: [capture directory, ...] } }
        to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
        for capture_uuid in get_captures_dir().rglob('uuid'):
            capture_dir = capture_uuid.parent
            try:
                timestamp = datetime.strptime(capture_dir.name, '%Y-%m-%dT%H:%M:%S.%f')
            except ValueError:
                try:
                    # Same format, without the microseconds
                    timestamp = datetime.strptime(capture_dir.name, '%Y-%m-%dT%H:%M:%S')
                except ValueError:
                    # One unexpected directory must not prevent archiving everything else.
                    self.logger.warning(f'Unable to get the capture time from {capture_dir}, skipping.')
                    continue
            if timestamp.date() >= cut_time:
                continue
            to_archive[timestamp.year][timestamp.month].append(capture_dir)
            self.logger.info(f'Archiving {capture_dir}.')

        if not to_archive:
            self.logger.info('Nothing to archive.')
            return

        p = self.redis.pipeline()
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                dest_dir.mkdir(parents=True, exist_ok=True)
                for capture_path in captures:
                    # Drop the redis key named after the old path of the capture
                    # (presumably its cache entry - TODO confirm against the writer).
                    p.delete(str(capture_path))
                    # The pickled trees are not moved to the archive.
                    (capture_path / 'tree.pickle').unlink(missing_ok=True)
                    (capture_path / 'tree.pickle.gz').unlink(missing_ok=True)
                    capture_path.rename(dest_dir / capture_path.name)
        p.execute()

        self.logger.info('Archiving done.')

    def _compress_hars(self):
        '''Gzip every `*.har` file of the indexed archived captures, and remove the original.'''
        self.logger.info('Compressing archived captures')
        for index in self.archived_captures_dir.rglob('index'):
            with index.open('r', newline='') as _f:
                for uuid, dirname in csv.reader(_f):
                    for har in (index.parent / dirname).rglob('*.har'):
                        # rglob is lazy, the file may be gone by the time we reach it.
                        if not har.exists():
                            continue
                        with har.open('rb') as f_in, gzip.open(f'{har}.gz', 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
                        # Only reached if the compressed copy was fully written.
                        har.unlink()
        self.logger.info('Archived captures compressed')

    def _read_index(self, index: Path) -> Dict[str, str]:
        '''Return the `uuid -> capture directory` mapping stored in an index file,
        without the entries pointing to a directory that doesn't exist.'''
        with index.open('r', newline='') as _f:
            return {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)
                    if (index.parent / dirname).exists()}

    def _load_indexes(self):
        '''Load all the index files in redis, and remove the ones without any valid entry.

        Recent captures go in the `lookup_dirs` hash, archived ones in
        `lookup_dirs_archived` (and are removed from `lookup_dirs`).
        '''
        # Initialize recent captures
        for index in get_captures_dir().rglob('index'):
            recent_uuids: Mapping = self._read_index(index)
            if recent_uuids:
                self.redis.hset('lookup_dirs', mapping=recent_uuids)
            else:
                index.unlink()
        self.logger.info('Recent indexes loaded')

        # Initialize archives
        for index in self.archived_captures_dir.rglob('index'):
            archived_uuids: Mapping = self._read_index(index)
            if archived_uuids:
                self.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
                # An archived capture must not be looked up as a recent one anymore.
                self.redis.hdel('lookup_dirs', *archived_uuids.keys())
            else:
                index.unlink()
        self.logger.info('Archived indexes loaded')
2021-08-23 15:14:08 +02:00
2021-08-20 17:46:22 +02:00
def main():
    '''Entry point: instantiate the archiver and call run() with sleep_in_sec=3600.'''
    archiver = Archiver()
    archiver.run(sleep_in_sec=3600)
2021-08-20 17:46:22 +02:00
# Only start the archiver when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()