diff --git a/bin/archiver.py b/bin/archiver.py
index 40ca8a7..551a196 100755
--- a/bin/archiver.py
+++ b/bin/archiver.py
@@ -5,7 +5,7 @@ from collections import defaultdict
 import csv
 from datetime import datetime, timedelta
 import logging
-from typing import Dict, List, Tuple
+from typing import Dict, List
 from pathlib import Path
 
 from redis import Redis
@@ -28,10 +28,55 @@ class Archiver(AbstractManager):
         self.archived_captures_dir = get_homedir() / 'archived_captures'
         self.archived_captures_dir.mkdir(parents=True, exist_ok=True)
 
-        self._load_archives()
+        self._load_indexes()
 
     def _to_run_forever(self):
         self._archive()
+        self._update_all_capture_indexes()
+        self._load_indexes()
+
+    def _update_index(self, root_dir: Path) -> None:
+        current_index: Dict[str, str]
+
+        index_file = root_dir / 'index'
+        if index_file.exists():
+            # Skip index if the directory has been archived.
+            existing_captures = index_file.parent.iterdir()
+            with index_file.open('r') as _f:
+                current_index = {uuid: dirname for uuid, dirname in csv.reader(_f) if (index_file.parent / dirname) in existing_captures}
+            if not current_index:
+                index_file.unlink()
+        else:
+            current_index = {}
+
+        for uuid_file in root_dir.glob('*/uuid'):
+            if uuid_file.parent.name in current_index.values():
+                # The path is already in the index file, no need to read the uuid file
+                continue
+            with uuid_file.open() as _f:
+                current_index[_f.read().strip()] = uuid_file.parent.name
+
+        if not current_index:
+            # The directory has been archived.
+            root_dir.unlink()
+            return
+
+        with index_file.open('w') as _f:
+            index_writer = csv.writer(_f)
+            for uuid, dirname in current_index.items():
+                index_writer.writerow([uuid, dirname])
+
+    def _update_all_capture_indexes(self):
+        '''Run that after the captures are in the proper directories'''
+        # Recent captures
+        directories_to_index = set(capture_dir.parent.parent for capture_dir in get_captures_dir().glob('**/uuid'))
+        for directory_to_index in directories_to_index:
+            self._update_index(directory_to_index)
+
+        # Archived captures
+        directories_to_index = set(capture_dir.parent.parent for capture_dir in self.archived_captures_dir.glob('**/uuid'))
+        for directory_to_index in directories_to_index:
+            self._update_index(directory_to_index)
 
     def _archive(self):
         archive_interval = timedelta(days=get_config('generic', 'archive'))
@@ -40,54 +85,40 @@ class Archiver(AbstractManager):
 
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
-        to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
+        to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
         for capture_uuid in get_captures_dir().glob('**/uuid'):
             timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
-                # do not archive.
                 continue
-            with capture_uuid.open() as _f:
-                uuid = _f.read().strip()
-            to_archive[timestamp.year][timestamp.month].append((capture_uuid.parent, uuid))
+            to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent)
             self.logger.info(f'Archiving {capture_uuid.parent}.')
 
         if not to_archive:
             self.logger.info('Nothing to archive.')
             return
 
-        archived_uuids = {}
+        p = self.redis.pipeline()
         for year, month_captures in to_archive.items():
             for month, captures in month_captures.items():
                 dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                 dest_dir.mkdir(parents=True, exist_ok=True)
-                if (dest_dir / 'index').exists():
-                    with (dest_dir / 'index').open('r') as _f:
-                        current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)}
-                else:
-                    current_index = {}
-                for capture_path, uuid in captures:
-                    current_index[uuid] = capture_path.name
+                for capture_path in captures:
+                    p.delete(str(capture_path))
                     capture_path.rename(dest_dir / capture_path.name)
-                    archived_uuids[uuid] = str(dest_dir / capture_path.name)
-                with (dest_dir / 'index').open('w') as _f:
-                    index_writer = csv.writer(_f)
-                    for uuid, dirname in current_index.items():
-                        index_writer.writerow([uuid, dirname])
+        p.execute()
+
+        # Clear empty
-        if archived_uuids:
-            p = self.redis.pipeline()
-            for dir_key in self.redis.hmget('lookup_dirs', *archived_uuids.keys()):
-                # Clear cache
-                if dir_key:
-                    p.delete(dir_key)
-            p.hdel('lookup_dirs', *archived_uuids.keys())
-            p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
-            p.execute()
 
         self.logger.info('Archiving done.')
 
-    def _load_archives(self):
+    def _load_indexes(self):
+        # Initialize archives
+        for index in get_captures_dir().glob('**/index'):
+            with index.open('r') as _f:
+                recent_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
+            self.redis.hmset('lookup_dirs', recent_uuids)  # type: ignore
+
         # Initialize archives
-        self.redis.delete('lookup_dirs_archived')
         for index in self.archived_captures_dir.glob('**/index'):
             with index.open('r') as _f:
                 archived_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
@@ -96,7 +127,7 @@ class Archiver(AbstractManager):
 
 def main():
     a = Archiver()
-    a.run(sleep_in_sec=3600 * 24)
+    a.run(sleep_in_sec=3600)
 
 
 if __name__ == '__main__':
diff --git a/bin/background_processing.py b/bin/background_processing.py
index f88cd3c..5d8d342 100755
--- a/bin/background_processing.py
+++ b/bin/background_processing.py
@@ -37,7 +37,7 @@ class Processing(AbstractManager):
         safe_create_dir(self_generated_ua_file_path)
         self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
         if self_generated_ua_file.exists():
-            self.logger.info('User-agent file for {yesterday} already exists.')
+            self.logger.info(f'User-agent file for {yesterday} already exists.')
             return
         self.logger.info(f'Generating user-agent file for {yesterday}')
         redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
diff --git a/bin/start.py b/bin/start.py
index e10d5d3..f0135be 100755
--- a/bin/start.py
+++ b/bin/start.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 from subprocess import run, Popen
-from lookyloo.helpers import get_homedir, get_config, reload_uuids_index
+from lookyloo.helpers import get_homedir, get_config
 
 
 def main():
@@ -12,11 +12,8 @@ def main():
     p = run(['run_backend', '--start'])
     p.check_returncode()
     print('done.')
-    print('Reload UUIDs index...')
-    print('If this is taking too long, it means you have a lot of captures.')
-    print('You should run tools/change_captures_dir.py to re-organize the capture directory by year and month.')
-    print('You may also want to archive more captures.')
-    reload_uuids_index()
+    print('Start archiving process...')
+    Popen(['archiver'])
     print('done.')
     print('Start asynchronous ingestor...')
     for _ in range(get_config('generic', 'async_capture_processes')):
@@ -28,9 +25,6 @@ def main():
     print('Start background processing...')
     Popen(['processing'])
     print('done.')
-    print('Start archiving process...')
-    Popen(['archiver'])
-    print('done.')
     print('Start website...')
     Popen(['start_website'])
     print('done.')
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index b10e34b..b66cc19 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -264,21 +264,6 @@ def get_useragent_for_requests():
     return f'Lookyloo / {version}'
 
 
-def reload_uuids_index() -> None:
-    recent_uuids: Dict[str, str] = {}
-    for uuid_path in get_captures_dir().glob('**/uuid'):
-        with uuid_path.open() as f:
-            uuid = f.read()
-        recent_uuids[uuid] = str(uuid_path.parent)
-    if not recent_uuids:
-        return None
-    r = Redis(unix_socket_path=get_socket_path('cache'))
-    p = r.pipeline()
-    p.delete('lookup_dirs')
-    p.hmset('lookup_dirs', recent_uuids)  # type: ignore
-    p.execute()
-
-
 def get_capture_status(capture_uuid: str, /) -> CaptureStatus:
     r = Redis(unix_socket_path=get_socket_path('cache'))
     if r.zrank('to_capture', capture_uuid) is not None: