chg: Make archiver an index generator

pull/254/head
Raphaël Vinot 2021-08-30 12:48:13 +02:00
parent 1bff8f1529
commit 117500b777
4 changed files with 67 additions and 57 deletions
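
The commit below turns the archiver into the component that maintains the capture indexes: each year/month capture directory gets an `index` file, a two-column CSV mapping a capture UUID to the name of its capture directory, and the Redis lookup hashes are rebuilt from those files. A minimal sketch of reading such an index (the path is illustrative, not taken from the diff):

import csv
from pathlib import Path

# Hypothetical per-month directory, e.g. archived_captures/2021/08 (illustrative path).
month_dir = Path('archived_captures/2021/08')

# Each row of the index file is "uuid,capture_directory_name".
with (month_dir / 'index').open('r') as f:
    index = {uuid: month_dir / dirname for uuid, dirname in csv.reader(f)}

for uuid, capture_dir in index.items():
    print(uuid, '->', capture_dir)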

View File

@@ -5,7 +5,7 @@ from collections import defaultdict
import csv
from datetime import datetime, timedelta
import logging
from typing import Dict, List, Tuple
from typing import Dict, List
from pathlib import Path

from redis import Redis
@@ -28,10 +28,55 @@ class Archiver(AbstractManager):
        self.archived_captures_dir = get_homedir() / 'archived_captures'
        self.archived_captures_dir.mkdir(parents=True, exist_ok=True)
        self._load_archives()
        self._load_indexes()

    def _to_run_forever(self):
        self._archive()
        self._update_all_capture_indexes()
        self._load_indexes()

    def _update_index(self, root_dir: Path) -> None:
        current_index: Dict[str, str]
        index_file = root_dir / 'index'
        if index_file.exists():
            # Skip index if the directory has been archived.
            existing_captures = index_file.parent.iterdir()
            with index_file.open('r') as _f:
                current_index = {uuid: dirname for uuid, dirname in csv.reader(_f) if (index_file.parent / dirname) in existing_captures}
            if not current_index:
                index_file.unlink()
        else:
            current_index = {}

        for uuid_file in root_dir.glob('*/uuid'):
            if uuid_file.parent.name in current_index.values():
                # The path is already in the index file, no need to read the uuid file
                continue
            with uuid_file.open() as _f:
                current_index[_f.read().strip()] = uuid_file.parent.name

        if not current_index:
            # The directory has been archived.
            root_dir.rmdir()  # unlink() cannot remove a directory
            return

        with index_file.open('w') as _f:
            index_writer = csv.writer(_f)
            for uuid, dirname in current_index.items():
                index_writer.writerow([uuid, dirname])

    def _update_all_capture_indexes(self):
        '''Run that after the captures are in the proper directories'''
        # Recent captures
        directories_to_index = set(capture_dir.parent.parent for capture_dir in get_captures_dir().glob('**/uuid'))
        for directory_to_index in directories_to_index:
            self._update_index(directory_to_index)
        # Archived captures
        directories_to_index = set(capture_dir.parent.parent for capture_dir in self.archived_captures_dir.glob('**/uuid'))
        for directory_to_index in directories_to_index:
            self._update_index(directory_to_index)

    def _archive(self):
        archive_interval = timedelta(days=get_config('generic', 'archive'))
@@ -40,54 +85,40 @@ class Archiver(AbstractManager):
        # Format:
        # { 2020: { 12: [(directory, uuid)] } }
        to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
        to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
        for capture_uuid in get_captures_dir().glob('**/uuid'):
            timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
            if timestamp.date() >= cut_time:
                # do not archive.
                continue
            with capture_uuid.open() as _f:
                uuid = _f.read().strip()
            to_archive[timestamp.year][timestamp.month].append((capture_uuid.parent, uuid))
            to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent)
            self.logger.info(f'Archiving {capture_uuid.parent}.')

        if not to_archive:
            self.logger.info('Nothing to archive.')
            return

        archived_uuids = {}
        p = self.redis.pipeline()
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                dest_dir.mkdir(parents=True, exist_ok=True)
                if (dest_dir / 'index').exists():
                    with (dest_dir / 'index').open('r') as _f:
                        current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)}
                else:
                    current_index = {}
                for capture_path, uuid in captures:
                    current_index[uuid] = capture_path.name
                for capture_path in captures:
                    p.delete(str(capture_path))
                    capture_path.rename(dest_dir / capture_path.name)
                    archived_uuids[uuid] = str(dest_dir / capture_path.name)
                with (dest_dir / 'index').open('w') as _f:
                    index_writer = csv.writer(_f)
                    for uuid, dirname in current_index.items():
                        index_writer.writerow([uuid, dirname])
        p.execute()

        # Clear empty
        if archived_uuids:
            p = self.redis.pipeline()
            for dir_key in self.redis.hmget('lookup_dirs', *archived_uuids.keys()):
                # Clear cache
                if dir_key:
                    p.delete(dir_key)
            p.hdel('lookup_dirs', *archived_uuids.keys())
            p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
            p.execute()

        self.logger.info('Archiving done.')

    def _load_archives(self):
    def _load_indexes(self):
        # Initialize archives
        for index in get_captures_dir().glob('**/index'):
            with index.open('r') as _f:
                recent_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
            self.redis.hmset('lookup_dirs', recent_uuids)  # type: ignore

        # Initialize archives
        self.redis.delete('lookup_dirs_archived')
        for index in self.archived_captures_dir.glob('**/index'):
            with index.open('r') as _f:
                archived_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
@@ -96,7 +127,7 @@ class Archiver(AbstractManager):

def main():
    a = Archiver()
    a.run(sleep_in_sec=3600 * 24)
    a.run(sleep_in_sec=3600)


if __name__ == '__main__':
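
Once `_load_indexes` has populated `lookup_dirs` (recent captures) and `lookup_dirs_archived` (archived ones), resolving a capture UUID to its directory is a pair of Redis hash lookups. A minimal consumer sketch; the helper name, socket path and UUID are assumptions, not part of the diff:

from typing import Optional

from redis import Redis


def find_capture_dir(r: Redis, capture_uuid: str) -> Optional[str]:
    # Check the recent captures first, then the archived ones.
    return r.hget('lookup_dirs', capture_uuid) or r.hget('lookup_dirs_archived', capture_uuid)


r = Redis(unix_socket_path='cache.sock', decode_responses=True)  # illustrative socket path
print(find_capture_dir(r, '5b198b16-3c39-4d94-9e8b-0e322e0903a7'))  # made-up UUID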

View File

@@ -37,7 +37,7 @@ class Processing(AbstractManager):
        safe_create_dir(self_generated_ua_file_path)
        self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
        if self_generated_ua_file.exists():
            self.logger.info('User-agent file for {yesterday} already exists.')
            self.logger.info(f'User-agent file for {yesterday} already exists.')
            return
        self.logger.info(f'Generating user-agent file for {yesterday}')
        redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

View File

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

from subprocess import run, Popen
from lookyloo.helpers import get_homedir, get_config, reload_uuids_index
from lookyloo.helpers import get_homedir, get_config


def main():
@@ -12,11 +12,8 @@ def main():
    p = run(['run_backend', '--start'])
    p.check_returncode()
    print('done.')
    print('Reload UUIDs index...')
    print('If this is taking too long, it means you have a lot of captures.')
    print('You should run tools/change_captures_dir.py to re-organize the capture directory by year and month.')
    print('You may also want to archive more captures.')
    reload_uuids_index()
    print('Start archiving process...')
    Popen(['archiver'])
    print('done.')
    print('Start asynchronous ingestor...')
    for _ in range(get_config('generic', 'async_capture_processes')):
@@ -28,9 +25,6 @@ def main():
    print('Start background processing...')
    Popen(['processing'])
    print('done.')
    print('Start archiving process...')
    Popen(['archiver'])
    print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')

View File

@@ -264,21 +264,6 @@ def get_useragent_for_requests():
    return f'Lookyloo / {version}'


def reload_uuids_index() -> None:
    recent_uuids: Dict[str, str] = {}
    for uuid_path in get_captures_dir().glob('**/uuid'):
        with uuid_path.open() as f:
            uuid = f.read()
        recent_uuids[uuid] = str(uuid_path.parent)
    if not recent_uuids:
        return None
    r = Redis(unix_socket_path=get_socket_path('cache'))
    p = r.pipeline()
    p.delete('lookup_dirs')
    p.hmset('lookup_dirs', recent_uuids)  # type: ignore
    p.execute()


def get_capture_status(capture_uuid: str, /) -> CaptureStatus:
    r = Redis(unix_socket_path=get_socket_path('cache'))
    if r.zrank('to_capture', capture_uuid) is not None:
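
The removed `reload_uuids_index` opened every single `uuid` file under the captures directory on each start, which is why the start script warned that it could take a long time. After this change the same information comes from the much smaller per-directory `index` files, reloaded hourly by the archiver. A rough sketch of that replacement path, assuming the same Redis layout (the function name is made up):

import csv
from pathlib import Path
from typing import Dict

from redis import Redis


def rebuild_lookup_dirs(captures_dir: Path, redis_socket: str) -> None:
    # One small CSV per year/month directory instead of one 'uuid' file per capture.
    recent_uuids: Dict[str, str] = {}
    for index in captures_dir.glob('**/index'):
        with index.open('r') as f:
            for uuid, dirname in csv.reader(f):
                recent_uuids[uuid] = str(index.parent / dirname)
    if recent_uuids:
        r = Redis(unix_socket_path=redis_socket)
        r.hmset('lookup_dirs', recent_uuids)  # hmset matches the diff; newer redis-py prefers hset(mapping=...)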