From efe21247534b1b8952af7a782aa456145e4a93d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Mon, 20 Nov 2023 11:45:41 +0100 Subject: [PATCH] fix: Quit BG indexer when shutdown is requested. Improve exceptions handling in archiver --- README.md | 68 +++++++++++++++++++++++++++++++++++++++ bin/archiver.py | 10 ++++-- bin/background_indexer.py | 5 +++ 3 files changed, 80 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 79bcf648..dd905cb3 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,74 @@ pip install pylookyloo For more details on `pylookyloo`, read the overview [docs](https://www.lookyloo.eu/docs/main/pylookyloo-overview.html), the [documentation](https://pylookyloo.readthedocs.io/en/latest/) of the module itself, or the code in this [GitHub repository](https://github.com/Lookyloo/PyLookyloo). +# Notes regarding using S3FS for storage + +## Directory listing + +TL;DR: it is slow. + +If you have many captures (say more than 1000/day), and store captures in a s3fs bucket mounted with s3fs-fuse, +doing a directory listing in bash (`ls`) will most probably lock the I/O for every process +trying to access any file in the whole bucket. The same will be true if you access the +filesystem using python methods (`iterdir`, `scandir`...). + +A workaround is to use the python s3fs module as it will not access the filesystem for listing directories. +You can configure the s3fs credentials in `config/generic.json` key `s3fs`. + +## Versioning + +By default, a MinIO bucket (backend for s3fs) will have versioning enabled, which means it +keeps a copy of every version of every file you're storing. It becomes a problem if you have a lot of captures +as the index files are updated on every change, and the max amount of versions is 10.000. +So by the time you have > 10.000 captures in a directory, you'll get I/O errors when you try +to update the index file. And you absolutely do not care about that versioning in lookyloo. 
+ +To check if versioning is enabled (can be either enabled or suspended): + +``` +mc version info / +``` + +The command below will suspend versioning: + +```bash +mc version suspend / +``` + +And if you're already stuck with an index that was updated 10.000 times and you cannot do anything about it: + +```bash +mc rm --non-current --versions --recursive --force //path/to/index +``` + +Error message from bash (unhelpful): + +```bash +$ (git::main) rm /path/to/lookyloo/archived_captures/Year/Month/Day/index +rm: cannot remove '/path/to/lookyloo/archived_captures/Year/Month/Day/index': Input/output error +``` + +Python code: + +```python +from lookyloo.default import get_config +import s3fs + +s3fs_config = get_config('generic', 's3fs') +s3fs_client = s3fs.S3FileSystem(key=s3fs_config['config']['key'], + secret=s3fs_config['config']['secret'], + endpoint_url=s3fs_config['config']['endpoint_url']) + +s3fs_bucket = s3fs_config['config']['bucket_name'] +s3fs_client.rm_file(s3fs_bucket + '/Year/Month/Day/index') +``` + +Error from python (somewhat more helpful): +``` +OSError: [Errno 5] An error occurred (MaxVersionsExceeded) when calling the DeleteObject operation: You've exceeded the limit on the number of versions you can create on this object +``` + + # Contributing to Lookyloo To learn more about contributing to Lookyloo, see our [contributor guide](https://www.lookyloo.eu/docs/main/contributing.html). 
diff --git a/bin/archiver.py b/bin/archiver.py index 175ff08b..7b615495 100755 --- a/bin/archiver.py +++ b/bin/archiver.py @@ -324,9 +324,13 @@ class Archiver(AbstractManager): try: new_capture_path = self.__archive_single_capture(capture_path) capture_breakpoint -= 1 - except OSError as e: - self.logger.warning(f'Unable to archive capture: {e}') - finally: + except OSError: + self.logger.exception(f'Unable to archive capture {capture_path}') + (capture_path / 'lock').unlink(missing_ok=True) + except Exception: + self.logger.exception(f'Critical exception while archiving {capture_path}') + (capture_path / 'lock').unlink(missing_ok=True) + else: (new_capture_path / 'lock').unlink(missing_ok=True) if archiving_done: diff --git a/bin/background_indexer.py b/bin/background_indexer.py index 24a9e6ef..0630e37d 100755 --- a/bin/background_indexer.py +++ b/bin/background_indexer.py @@ -45,7 +45,12 @@ class BackgroundIndexer(AbstractManager): archive_interval = timedelta(days=get_config('generic', 'archive')) cut_time = (datetime.now() - archive_interval) for month_dir in make_dirs_list(self.lookyloo.capture_dir): + __counter_shutdown = 0 for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True): + __counter_shutdown += 1 + if __counter_shutdown % 10 and self.shutdown_requested(): + self.logger.warning('Shutdown requested, breaking.') + return False if ((path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists()): # We already have a pickle file self.logger.debug(f'{path} has a pickle.')