From c203aa91b95d1bb68c16e65f7ad11eda91bc4605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Fri, 4 Aug 2023 14:02:45 +0200 Subject: [PATCH] chg: Avoid directory listing as much as possible in archiver, allow shutdown --- bin/archiver.py | 42 ++++++++++++++++++++++++++++++--------- bin/background_indexer.py | 2 ++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/bin/archiver.py b/bin/archiver.py index c15cd271..f8eed940 100755 --- a/bin/archiver.py +++ b/bin/archiver.py @@ -8,7 +8,7 @@ import shutil from collections import defaultdict from collections.abc import Mapping -from datetime import datetime, timedelta +from datetime import datetime, timedelta, date from pathlib import Path from typing import Dict, List, Optional @@ -88,6 +88,9 @@ class Archiver(AbstractManager): self.logger.info('Update archives indexes') directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.glob('*/*/*/uuid')} for directory_to_index in directories_to_index: + if self.shutdown_requested(): + self.logger.warning('Shutdown requested, breaking.') + break self.logger.debug(f'Updating index for {directory_to_index}') self._update_index(directory_to_index) self.logger.info('Archived indexes updated') @@ -100,15 +103,26 @@ class Archiver(AbstractManager): # Format: # { 2020: { 12: [(directory, uuid)] } } to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list)) - for capture_uuid in get_captures_dir().glob('*/*/*/uuid'): - try: - timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f') - except ValueError: - timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S') - if timestamp.date() >= cut_time: + # In order to avoid scanning the complete directory on each run, we check if year and month are + # older than the cut time. + for index in get_captures_dir().glob('*/*/index'): + if self.shutdown_requested(): + self.logger.warning('Shutdown requested, breaking.') + break + month = int(index.parent.name) + year = int(index.parent.parent.name) + if date(year, month, 1) >= cut_time: continue - to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent) - self.logger.info(f'Archiving {capture_uuid.parent}.') + + for capture_uuid in index.parent.glob('*/uuid'): + try: + timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f') + except ValueError: + timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S') + if timestamp.date() >= cut_time: + continue + to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent) + self.logger.info(f'Archiving {capture_uuid.parent}.') if not to_archive: self.logger.info('Nothing to archive.') @@ -131,6 +145,9 @@ class Archiver(AbstractManager): def _compress_hars(self): self.logger.info('Compressing archived captures') for index in self.archived_captures_dir.glob('*/*/index'): + if self.shutdown_requested(): + self.logger.warning('Shutdown requested, breaking.') + break with index.open('r') as _f: for uuid, dirname in csv.reader(_f): for har in (index.parent / dirname).rglob('*.har'): @@ -145,6 +162,10 @@ class Archiver(AbstractManager): def _load_indexes(self): # Initialize archives for index in get_captures_dir().glob('*/*/index'): + if self.shutdown_requested(): + self.logger.warning('Shutdown requested, breaking.') + break + with index.open('r') as _f: recent_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()} if recent_uuids: @@ -155,6 +176,9 @@ class Archiver(AbstractManager): # Initialize archives for index in self.archived_captures_dir.glob('*/*/index'): + if self.shutdown_requested(): + self.logger.warning('Shutdown requested, breaking.') + break with index.open('r') as _f: archived_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()} if archived_uuids: diff --git a/bin/background_indexer.py b/bin/background_indexer.py index b543ec79..c37c8e57 100755 --- a/bin/background_indexer.py +++ b/bin/background_indexer.py @@ -84,6 +84,8 @@ class BackgroundIndexer(AbstractManager): self.logger.warning(f'Unable to find {uuid}. That should not happen.') except NoValidHarFile as e: self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}') + except FileNotFoundError: + self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.') except Exception: self.logger.exception(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}') # The capture is not working, moving it away.