new: Don't attempt to initialize indexes if they're on an s3fs mount

pull/799/head
Raphaël Vinot 2023-10-04 11:06:02 +02:00
parent e3b85508f1
commit f2c9647a9e
2 changed files with 28 additions and 2 deletions


@@ -34,6 +34,19 @@ class Archiver(AbstractManager):
self._load_indexes()
# NOTE 2023-10-03: if we store the archived captures in s3fs (as is the case in the CIRCL demo instance),
# listing the directories directly with s3fs-fuse causes I/O errors and makes the interface unusable.
# It is only a problem when listing directories, not when accessing a capture, so we only need to change the way
# we generate the index files.
# Other issue: the python module s3fs requires urllib3 < 2.0 (https://github.com/boto/botocore/issues/2926) so
# we cannot run the script creating the indexes in the same virtual environment as the rest of the project.
# The variable below is only used to make sure we don't trigger a directory listing on an s3fs-fuse mount;
# the index files are created by another script, in tools/create_archive_indexes.
self.archive_on_s3fs = False
s3fs_config = get_config('generic', 's3fs')
if s3fs_config.get('archive_on_s3fs'):
    self.archive_on_s3fs = True
def _to_run_forever(self):
archiving_done = False
# NOTE: When we archive a big directory, moving *a lot* of files, especially to MinIO
@@ -159,6 +172,9 @@ class Archiver(AbstractManager):
break
self._update_index(directory_to_index)
self.logger.info('Recent indexes updated')
if self.archive_on_s3fs:
    self.logger.info('Not updating indexes as they are on a s3fs-fuse mount.')
    return
# Archived captures
self.logger.info('Update archives indexes')
for directory_to_index in self._make_dirs_list(self.archived_captures_dir):
@@ -330,7 +346,7 @@
def main():
a = Archiver()
- a.run(sleep_in_sec=36000)
+ a.run(sleep_in_sec=3600)
if __name__ == '__main__':

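For context, the separate script referenced in the comment above (tools/create_archive_indexes) is not part of this diff, and it has to run in its own virtual environment because of the urllib3 pin. Below is a minimal sketch of what such a script could look like, assuming the s3fs python module and placeholder credentials; the helper name and the index layout (one line per capture with its UUID and directory name) are illustrative, not taken from the repository.

import s3fs  # talks to the bucket directly, so no s3fs-fuse directory listing is needed

# Placeholder connection settings; in practice they would come from the new
# "s3fs" -> "config" block added to the generic config below.
fs = s3fs.S3FileSystem(
    key='<access key>',
    secret='<secret key>',
    client_kwargs={'endpoint_url': '<endpoint_url>'},
)
bucket_name = '<bucket_name>'


def build_index(archive_dir: str) -> None:
    # List the capture directories in one archived directory and write an index
    # file mapping each capture UUID to its directory name.
    entries = []
    for capture_dir in fs.ls(f'{bucket_name}/{archive_dir}'):
        uuid_path = f'{capture_dir}/uuid'
        if fs.exists(uuid_path):
            with fs.open(uuid_path, 'rt') as f:
                entries.append(f'{f.read().strip()},{capture_dir.rsplit("/", 1)[-1]}')
    with fs.open(f'{bucket_name}/{archive_dir}/index', 'wt') as f:
        f.write('\n'.join(entries))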

@@ -70,6 +70,15 @@
"archive": 180,
"max_capture_time": 3600,
"max_tree_create_time": 120,
"s3fs": {
"archive_on_s3fs": false,
"config": {
"key": "",
"secret": "",
"endpoint_url": "",
"bucket_name": ""
}
},
"_notes": {
"loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
"only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network",
@@ -100,6 +109,7 @@
"hide_captures_with_error": "Capturing an URL may result in an error (domain non-existent, HTTP error, ...). They may be useful to see, but if you have a public instance, they will clutter the index.",
"archive": "The captures older than this value (in days) will be archived. They're not cached by default in the Lookyloo class.",
"max_capture_time": "The very maximal time we allow a capture to keep going. Should only be triggered by captures that cause playwright to never quit.",
"max_tree_create_time": "The max time the generation of a tree is allowed to take"
"max_tree_create_time": "The max time the generation of a tree is allowed to take",
"s3fs": "The config to access a S3FS instance with the s3fs python module - it is not integrated properly for now as it requires urllib < 2.0 which is a non-started at this stage."
}
}
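To show how the new block is meant to be consumed, here is a hedged sketch of turning it into an s3fs connection, assuming the same get_config helper used in the Archiver hunk above and that the "config" sub-dictionary maps directly onto s3fs.S3FileSystem arguments; the import path for get_config and that mapping are assumptions, not shown in this commit.

import s3fs

from lookyloo.default import get_config  # same helper as in the Archiver hunk above; import path assumed

s3fs_config = get_config('generic', 's3fs')
if s3fs_config.get('archive_on_s3fs'):
    # The key/secret/endpoint_url settings plug straight into the s3fs python module;
    # the bucket name is the root under which the archive directories are listed.
    storage = s3fs.S3FileSystem(
        key=s3fs_config['config']['key'],
        secret=s3fs_config['config']['secret'],
        client_kwargs={'endpoint_url': s3fs_config['config']['endpoint_url']},
    )
    print(storage.ls(s3fs_config['config']['bucket_name']))

With "archive_on_s3fs" left at false (the default above), the early return in _to_run_forever never triggers and the Archiver keeps generating all index files itself, as before this commit.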