mirror of https://github.com/CIRCL/lookyloo
103 lines
4.1 KiB
Python
Executable File
103 lines
4.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
from collections import defaultdict
|
|
import csv
|
|
from datetime import datetime, timedelta
|
|
import logging
|
|
from typing import Dict, List, Tuple
|
|
from pathlib import Path
|
|
|
|
from lookyloo.abstractmanager import AbstractManager
|
|
from lookyloo.lookyloo import Lookyloo
|
|
from lookyloo.helpers import get_config
|
|
|
|
# Configure the root logger once at import time: timestamp, logger name,
# level and message on every record, INFO and above.
# NOTE(review): '%I' is the 12-hour clock hour and no '%p' is present, so
# morning and afternoon timestamps look identical -- confirm this is intended.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s:%(message)s',
    level=logging.INFO,
    datefmt='%I:%M:%S',
)
|
|
|
|
|
|
class Archiver(AbstractManager):
    """Move captures older than the configured interval into an archive tree.

    Archived captures live in ``<capture_dir>/../archived_captures/<year>/<month>/``.
    Each month directory holds a CSV file named ``index`` whose rows are
    ``uuid,capture_directory_name``. The redis hashes ``lookup_dirs`` and
    ``lookup_dirs_archived`` are kept in sync with what is on disk.
    """

    def __init__(self, loglevel: int=logging.INFO):
        """Set up the manager and replay the on-disk indexes into redis.

        :param loglevel: logging level forwarded to ``AbstractManager``.
        """
        super().__init__(loglevel)
        self.script_name = 'archiver'
        self._load_indexes()

    def _to_run_forever(self) -> None:
        """Run one archiving pass.

        NOTE(review): presumably invoked periodically by ``AbstractManager.run``;
        confirm against the base class.
        """
        self._archive()

    @staticmethod
    def _read_index(index_path: Path) -> Dict[str, str]:
        """Return the ``{uuid: directory name}`` mapping stored in an index file."""
        index: Dict[str, str] = {}
        # newline='' is what the csv module expects for files it reads/writes.
        with index_path.open('r', newline='') as _f:
            for row in csv.reader(_f):
                if not row:
                    # Tolerate blank lines instead of failing on the unpacking.
                    continue
                uuid, dirname = row
                index[uuid] = dirname
        return index

    @staticmethod
    def _write_index(index_path: Path, index: Dict[str, str]) -> None:
        """Overwrite an index file with one ``uuid,directory name`` row per entry."""
        with index_path.open('w', newline='') as _f:
            csv.writer(_f).writerows(index.items())

    def _archive(self) -> None:
        """Move every capture older than the cut-off into the archive.

        The cut-off is ``now - timedelta(days=get_config('generic', 'archive'))``.
        Entries of the capture directory whose name is not a
        ``%Y-%m-%dT%H:%M:%S.%f`` timestamp, or whose ``uuid`` file cannot be
        read or is empty, are skipped with a warning instead of aborting the
        whole pass.
        """
        # Initialize the lookyloo class here, no need to keep it in memory all the time.
        lookyloo = Lookyloo()
        # make sure archived captures dir exists
        archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures'
        archived_captures_dir.mkdir(parents=True, exist_ok=True)
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = datetime.now() - archive_interval

        # Format:
        # { 2020: { 12: [(directory, uuid)] } }
        to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
        for capture_path in lookyloo.capture_dir.glob('*'):
            try:
                timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
            except ValueError:
                self.logger.warning(f'Skipping {capture_path}: name is not a capture timestamp.')
                continue
            if timestamp >= cut_time:
                # do not archive.
                continue
            try:
                with (capture_path / 'uuid').open() as _f:
                    uuid = _f.read().strip()
            except OSError as e:
                self.logger.warning(f'Skipping {capture_path}: unable to read the uuid file ({e}).')
                continue
            if not uuid:
                # An empty uuid would end up as an empty key in the index and in redis.
                self.logger.warning(f'Skipping {capture_path}: the uuid file is empty.')
                continue
            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
            self.logger.info(f'Archiving {capture_path}.')

        if not to_archive:
            self.logger.info('Nothing to archive.')
            return

        archived_uuids: Dict[str, str] = {}
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = archived_captures_dir / str(year) / f'{month:02}'
                dest_dir.mkdir(parents=True, exist_ok=True)
                index_path = dest_dir / 'index'
                current_index = self._read_index(index_path) if index_path.exists() else {}
                try:
                    for capture_path, uuid in captures:
                        destination = dest_dir / capture_path.name
                        capture_path.rename(destination)
                        # Only record a capture once it has actually been moved.
                        current_index[uuid] = capture_path.name
                        archived_uuids[uuid] = str(destination)
                finally:
                    # Persist the index even if a rename failed, so the captures
                    # already moved stay discoverable by _load_indexes.
                    self._write_index(index_path, current_index)

        if archived_uuids:
            # Add to the archived hash before removing from the live one (same
            # order as _load_indexes) so a uuid is never missing from both.
            lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
            lookyloo.redis.hdel('lookup_dirs', *archived_uuids.keys())
            lookyloo.clear_captures_index_cache(archived_uuids.keys())
        self.logger.info('Archiving done.')

    def _load_indexes(self) -> None:
        """Replay every ``<year>/<month>/index`` file of the archive into redis.

        Each uuid found is set in ``lookup_dirs_archived`` (value: full path of
        the archived capture directory) and removed from ``lookup_dirs``.
        """
        # Initialize the lookyloo class here, no need to keep it in memory all the time.
        lookyloo = Lookyloo()
        # make sure archived captures dir exists
        archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures'
        archived_captures_dir.mkdir(parents=True, exist_ok=True)
        for year in archived_captures_dir.iterdir():
            if not year.is_dir():
                # A stray file in the archive root cannot be iterated.
                continue
            for month in year.iterdir():
                index_path = month / 'index'
                if not index_path.exists():
                    continue
                archived_uuids = {uuid: str(month / dirname)
                                  for uuid, dirname in self._read_index(index_path).items()}
                if not archived_uuids:
                    # hset with an empty mapping and hdel without fields are both rejected.
                    continue
                lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
                lookyloo.redis.hdel('lookup_dirs', *archived_uuids.keys())
|
|
|
|
|
|
def main():
    """Build an ``Archiver`` and start it with ``sleep_in_sec`` set to one day."""
    one_day_in_sec = 3600 * 24
    archiver = Archiver()
    archiver.run(sleep_in_sec=one_day_in_sec)
|
|
|
|
|
|
# Script entry point: only start the archiver when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()
|