mirror of https://github.com/CIRCL/lookyloo
new: Don't attempt to initialize indexes if they're on a s3fs mount
parent
e3b85508f1
commit
f2c9647a9e
|
@ -34,6 +34,19 @@ class Archiver(AbstractManager):
|
|||
|
||||
self._load_indexes()
|
||||
|
||||
# NOTE 2023-10-03: if we store the archived captures in s3fs (as is the case in the CIRCL demo instance),
|
||||
# listing the directories directly with s3fs-fuse causes I/O errors and is making the interface unusable.
|
||||
# It is only a problem on directory listing and not when accessing a capture, so we only need to change the way
|
||||
# we generate the index files.
|
||||
# Other issue: the python module s3fs requires urllib3 < 2.0 (https://github.com/boto/botocore/issues/2926) so
|
||||
# we cannot run the script creating the indexes in the same virtual environment as the rest of the project.
|
||||
# The variable below will only be used to make sure we don't try to trigger a directory listing on a s3fs-fuse mount
|
||||
# and we're going to create the index files from another script, in tools/create_archive_indexes.
|
||||
self.archive_on_s3fs = False
|
||||
s3fs_config = get_config('generic', 's3fs')
|
||||
if s3fs_config.get('archive_on_s3fs'):
|
||||
self.archive_on_s3fs = True
|
||||
|
||||
def _to_run_forever(self):
|
||||
archiving_done = False
|
||||
# NOTE: When we archive a big directory, moving *a lot* of files, especially to MinIO
|
||||
|
@ -159,6 +172,9 @@ class Archiver(AbstractManager):
|
|||
break
|
||||
self._update_index(directory_to_index)
|
||||
self.logger.info('Recent indexes updated')
|
||||
if self.archive_on_s3fs:
|
||||
self.logger.info('Not updating indexes as they are on a s3fs-fuse mount.')
|
||||
return
|
||||
# Archived captures
|
||||
self.logger.info('Update archives indexes')
|
||||
for directory_to_index in self._make_dirs_list(self.archived_captures_dir):
|
||||
|
@ -330,7 +346,7 @@ class Archiver(AbstractManager):
|
|||
|
||||
def main():
|
||||
a = Archiver()
|
||||
a.run(sleep_in_sec=36000)
|
||||
a.run(sleep_in_sec=3600)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -70,6 +70,15 @@
|
|||
"archive": 180,
|
||||
"max_capture_time": 3600,
|
||||
"max_tree_create_time": 120,
|
||||
"s3fs": {
|
||||
"archive_on_s3fs": false,
|
||||
"config": {
|
||||
"key": "",
|
||||
"secret": "",
|
||||
"endpoint_url": "",
|
||||
"bucket_name": ""
|
||||
}
|
||||
},
|
||||
"_notes": {
|
||||
"loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
|
||||
"only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network",
|
||||
|
@ -100,6 +109,7 @@
|
|||
"hide_captures_with_error": "Capturing an URL may result in an error (domain non-existent, HTTP error, ...). They may be useful to see, but if you have a public instance, they will clutter the index.",
|
||||
"archive": "The captures older than this value (in days) will be archived. They're not cached by default in the Lookyloo class.",
|
||||
"max_capture_time": "The very maximal time we allow a capture to keep going. Should only be triggered by captures that cause playwright to never quit.",
|
||||
"max_tree_create_time": "The max time the generation of a tree is allowed to take"
|
||||
"max_tree_create_time": "The max time the generation of a tree is allowed to take",
|
||||
"s3fs": "The config to access a S3FS instance with the s3fs python module - it is not integrated properly for now as it requires urllib3 < 2.0 which is a non-starter at this stage."
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue