fix: Only rewrite the index files if there are changes.

pull/923/head
Raphaël Vinot 2024-06-24 15:24:55 +02:00
parent 37e92dbaf1
commit 994f70adcf
1 changed files with 48 additions and 18 deletions

View File

@ -79,7 +79,11 @@ class Archiver(AbstractManager):
logmsg = f'{logmsg} (s3fs)' logmsg = f'{logmsg} (s3fs)'
self.logger.info(logmsg) self.logger.info(logmsg)
# Flip that variable is we need to write the index
rewrite_index: bool = False
current_index: dict[str, str] = {} current_index: dict[str, str] = {}
current_sub_index: set[str] = set()
index_file = root_dir / 'index' index_file = root_dir / 'index'
if index_file.exists(): if index_file.exists():
try: try:
@ -87,12 +91,17 @@ class Archiver(AbstractManager):
except Exception as e: except Exception as e:
# the index file is broken, it will be recreated. # the index file is broken, it will be recreated.
self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}') self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}')
if not current_index:
# The file is either empty or only contains subs # Check if we have sub_index entries, they're skipped from the call above.
# NOTE: should we remove if it has subs? with index_file.open() as _i:
for key, path_name in csv.reader(_i):
if key == 'sub_index':
current_sub_index.add(path_name)
if not current_index and not current_sub_index:
# The file is empty
index_file.unlink() index_file.unlink()
sub_indexes: list[Path] = []
current_index_dirs: set[str] = set(current_index.values()) current_index_dirs: set[str] = set(current_index.values())
new_captures: set[Path] = set() new_captures: set[Path] = set()
# Directories that are actually in the listing. # Directories that are actually in the listing.
@ -112,9 +121,13 @@ class Archiver(AbstractManager):
continue continue
dir_on_disk = root_dir / entry.rsplit('/', 1)[-1] dir_on_disk = root_dir / entry.rsplit('/', 1)[-1]
if dir_on_disk.name.isdigit(): if dir_on_disk.name.isdigit():
if sub_index := self._update_index(dir_on_disk, s3fs_parent_dir=s3fs_dir): if self._update_index(dir_on_disk, s3fs_parent_dir=s3fs_dir):
# got a day directory that contains captures # got a day directory that contains captures
sub_indexes.append(sub_index) if dir_on_disk.name not in current_sub_index:
# ... and it's not in the index
rewrite_index = True
current_sub_index.add(dir_on_disk.name)
self.logger.info(f'Adding sub index {dir_on_disk.name} to {index_file}')
else: else:
# got a capture # got a capture
if len(self.s3fs_client.ls(entry, detail=False)) == 1: if len(self.s3fs_client.ls(entry, detail=False)) == 1:
@ -135,9 +148,13 @@ class Archiver(AbstractManager):
continue continue
dir_on_disk = Path(entry) dir_on_disk = Path(entry)
if dir_on_disk.name.isdigit(): if dir_on_disk.name.isdigit():
if sub_index := self._update_index(dir_on_disk): if self._update_index(dir_on_disk):
# got a day directory that contains captures # got a day directory that contains captures
sub_indexes.append(sub_index) if dir_on_disk.name not in current_sub_index:
# ... and it's not in the index
rewrite_index = True
current_sub_index.add(dir_on_disk.name)
self.logger.info(f'Adding sub index {dir_on_disk.name} to {index_file}')
else: else:
# isoformat # isoformat
if str(dir_on_disk) not in current_index_dirs: if str(dir_on_disk) not in current_index_dirs:
@ -150,8 +167,15 @@ class Archiver(AbstractManager):
if non_existing_dirs := current_index_dirs - current_dirs: if non_existing_dirs := current_index_dirs - current_dirs:
self.logger.info(f'Got {len(non_existing_dirs)} non existing dirs in {root_dir}, removing them from the index.') self.logger.info(f'Got {len(non_existing_dirs)} non existing dirs in {root_dir}, removing them from the index.')
current_index = {uuid: Path(path).name for uuid, path in current_index.items() if path not in non_existing_dirs} current_index = {uuid: Path(path).name for uuid, path in current_index.items() if path not in non_existing_dirs}
rewrite_index = True
if not current_index and not new_captures and not sub_indexes: # Make sure all the sub_index directories exist on the disk
if old_subindexes := {sub_index for sub_index in current_sub_index if sub_index not in current_dirs}:
self.logger.warning(f'Sub index {', '.join(old_subindexes)} do not exist, removing them from the index.')
rewrite_index = True
current_sub_index -= old_subindexes
if not current_index and not new_captures and not current_sub_index:
# No captures at all in the directory and subdirectories, quitting # No captures at all in the directory and subdirectories, quitting
logmsg = f'No captures in {root_dir}' logmsg = f'No captures in {root_dir}'
if s3fs_parent_dir: if s3fs_parent_dir:
@ -198,22 +222,25 @@ class Archiver(AbstractManager):
except OSError as e: except OSError as e:
self.logger.warning(f'Error when discarding capture {capture_dir}: {e}') self.logger.warning(f'Error when discarding capture {capture_dir}: {e}')
continue continue
rewrite_index = True
current_index[uuid] = capture_dir.name current_index[uuid] = capture_dir.name
if not current_index and not sub_indexes: if not current_index and not current_sub_index:
# The directory has been archived. It is probably safe to unlink, but # The directory has been archived. It is probably safe to unlink, but
# if it's not, we will lose a whole buch of captures. Moving instead for safety. # if it's not, we will lose a whole buch of captures. Moving instead for safety.
shutil.move(str(root_dir), str(get_homedir() / 'discarded_captures' / root_dir.parent / root_dir.name)) shutil.move(str(root_dir), str(get_homedir() / 'discarded_captures' / root_dir.parent / root_dir.name))
self.logger.warning(f'Nothing to index in {root_dir}') self.logger.warning(f'Nothing to index in {root_dir}')
return None return None
with index_file.open('w') as _f: if rewrite_index:
index_writer = csv.writer(_f) self.logger.info(f'Writing index {index_file}.')
for uuid, dirname in current_index.items(): with index_file.open('w') as _f:
index_writer.writerow([uuid, dirname]) index_writer = csv.writer(_f)
for sub_path in sub_indexes: for uuid, dirname in current_index.items():
# Only keep the dir name index_writer.writerow([uuid, Path(dirname).name])
index_writer.writerow(['sub_index', sub_path.parent.name]) for sub_path in sorted(current_sub_index):
# Only keep the dir name
index_writer.writerow(['sub_index', sub_path])
return index_file return index_file
@ -356,7 +383,10 @@ class Archiver(AbstractManager):
indexed_captures = {} indexed_captures = {}
with index_path.open() as _i: with index_path.open() as _i:
for key, path_name in csv.reader(_i): for key, path_name in csv.reader(_i):
if key == 'sub_index' and not ignore_sub: if key == 'sub_index' and ignore_sub:
# We're not interested in the sub indexes and don't want them to land in indexed_captures
continue
elif key == 'sub_index' and not ignore_sub:
sub_index_file = index_path.parent / path_name / 'index' sub_index_file = index_path.parent / path_name / 'index'
if sub_index_file.exists(): if sub_index_file.exists():
indexed_captures.update(self.__load_index(sub_index_file)) indexed_captures.update(self.__load_index(sub_index_file))