mirror of https://github.com/CIRCL/lookyloo
105 lines
3.9 KiB
Python
Executable File
105 lines
3.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
from collections import defaultdict
|
|
import csv
|
|
from datetime import datetime, timedelta
|
|
import logging
|
|
from typing import Dict, List, Tuple
|
|
from pathlib import Path
|
|
|
|
from redis import Redis
|
|
|
|
from lookyloo.abstractmanager import AbstractManager
|
|
from lookyloo.helpers import get_config, get_homedir, get_socket_path, get_captures_dir
|
|
|
|
# Root logger setup for this script: INFO level, time-only timestamps.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(name)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
)
|
|
|
|
|
|
class Archiver(AbstractManager):
    """Manager moving old capture directories into a per-month archive.

    Captures older than the configured interval are moved from the captures
    directory to ``<homedir>/archived_captures/<year>/<month>/``. Each month
    directory holds a CSV ``index`` file with ``uuid,dirname`` rows, and the
    redis hashes ``lookup_dirs`` / ``lookup_dirs_archived`` are updated to
    reflect the new location of the moved captures.
    """

    def __init__(self, loglevel: int=logging.INFO):
        """Connect to the cache redis socket and (re)load the archive indexes.

        :param loglevel: logging level handed to the parent manager.
        """
        super().__init__(loglevel)
        self.script_name = 'archiver'
        self.redis = Redis(unix_socket_path=get_socket_path('cache'))

        # make sure archived captures dir exists
        self.archived_captures_dir = get_homedir() / 'archived_captures'
        self.archived_captures_dir.mkdir(parents=True, exist_ok=True)

        self._load_archives()

    def _to_run_forever(self):
        """Run one archiving pass."""
        self._archive()

    @staticmethod
    def _read_index(index_path: Path) -> Dict[str, str]:
        """Return the ``uuid -> dirname`` mapping stored in *index_path*.

        A missing index file yields an empty mapping. Blank rows are skipped;
        any other row that does not have exactly two fields still raises
        ``ValueError`` so that a corrupted index does not go unnoticed.
        """
        index: Dict[str, str] = {}
        if not index_path.exists():
            return index
        # newline='' is what the csv module expects for files it reads/writes.
        with index_path.open('r', newline='') as _f:
            for row in csv.reader(_f):
                if not row:
                    # csv.reader returns an empty list for a blank line.
                    continue
                uuid, dirname = row
                index[uuid] = dirname
        return index

    def _captures_to_archive(self, cut_time) -> Dict[int, Dict[int, List[Tuple[Path, str]]]]:
        """Collect the capture directories dated strictly before *cut_time*.

        :param cut_time: a ``datetime.date``; captures on or after it are kept.
        :return: ``{year: {month: [(directory, uuid), ...]}}``

        Directories whose name is not a ``%Y-%m-%dT%H:%M:%S.%f`` timestamp, or
        that have no ``uuid`` file, are logged and skipped instead of aborting
        the whole archiving pass.
        """
        # Format:
        # { 2020: { 12: [(directory, uuid)] } }
        to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
        for capture_path in get_captures_dir().glob('*'):
            if not capture_path.is_dir():
                continue
            try:
                timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
            except ValueError:
                self.logger.warning(f'Unexpected directory name, skipping {capture_path}.')
                continue
            if timestamp.date() >= cut_time:
                # do not archive.
                continue
            try:
                with (capture_path / 'uuid').open() as _f:
                    uuid = _f.read().strip()
            except FileNotFoundError:
                self.logger.warning(f'No uuid file, skipping {capture_path}.')
                continue
            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
            self.logger.info(f'Archiving {capture_path}.')
        return to_archive

    def _archive(self):
        """Move old captures to the archive and update indexes and redis.

        The cut-off is the first day of the month containing
        ``now - <generic/archive config value, in days>``, so only captures
        from earlier months are moved.
        """
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = (datetime.now() - archive_interval).date()
        cut_time = cut_time.replace(day=1)

        to_archive = self._captures_to_archive(cut_time)
        if not to_archive:
            self.logger.info('Nothing to archive.')
            return

        archived_uuids: Dict[str, str] = {}
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                dest_dir.mkdir(parents=True, exist_ok=True)
                current_index = self._read_index(dest_dir / 'index')
                for capture_path, uuid in captures:
                    destination = dest_dir / capture_path.name
                    try:
                        capture_path.rename(destination)
                    except OSError as e:
                        # Keep going: one failing move must not leave the
                        # captures already moved out of the index and redis.
                        self.logger.warning(f'Unable to archive {capture_path}: {e}')
                        continue
                    # Only record captures that were actually moved.
                    current_index[uuid] = capture_path.name
                    archived_uuids[uuid] = str(destination)
                with (dest_dir / 'index').open('w', newline='') as _f:
                    index_writer = csv.writer(_f)
                    for uuid, dirname in current_index.items():
                        index_writer.writerow([uuid, dirname])

        if archived_uuids:
            # Drop the old lookups and register the archived paths in one go.
            # NOTE(review): newer redis-py releases deprecate hmset in favour
            # of hset(name, mapping=...); check the pinned version before
            # switching.
            p = self.redis.pipeline()
            p.hdel('lookup_dirs', *archived_uuids.keys())
            p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
            p.execute()
        self.logger.info('Archiving done.')

    def _load_archives(self):
        """Rebuild the ``lookup_dirs_archived`` redis hash from the indexes.

        The hash is deleted first, then refilled with ``uuid -> full path``
        entries read from every ``<year>/<month>/index`` file found under the
        archived captures directory.
        """
        # Initialize archives
        self.redis.delete('lookup_dirs_archived')
        for year in self.archived_captures_dir.iterdir():
            if not year.is_dir():
                # Stray file next to the year directories: nothing to load.
                continue
            for month in year.iterdir():
                archived_uuids: Dict[str, str] = {
                    uuid: str(month / dirname)
                    for uuid, dirname in self._read_index(month / 'index').items()
                }
                if not archived_uuids:
                    # Missing or empty index; hmset rejects an empty mapping.
                    continue
                self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
|
|
|
|
|
|
def main():
    """Create the Archiver and start its run loop with a 24 hour sleep value."""
    archiver = Archiver()
    archiver.run(sleep_in_sec=24 * 3600)
|
|
|
|
|
|
# Start the archiver only when this file is executed as a script.
if __name__ == "__main__":
    main()
|