mirror of https://github.com/CIRCL/lookyloo
new: Use S3FS in archiving script instead, remove python 3.12 support
parent bf9ff87dac
commit 353015096e
@@ -13,7 +13,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]

     steps:
     - uses: actions/checkout@v4
@@ -13,7 +13,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11"]

     steps:
     - uses: actions/checkout@v4
@@ -14,6 +14,7 @@ from pathlib import Path
 from typing import Dict, List, Optional

 from redis import Redis
+import s3fs  # type: ignore

 from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path, try_make_file
 from lookyloo.helpers import get_captures_dir, is_locked
@@ -46,6 +47,13 @@ class Archiver(AbstractManager):
         s3fs_config = get_config('generic', 's3fs')
         if s3fs_config.get('archive_on_s3fs'):
             self.archive_on_s3fs = True
+            self.s3fs_client = s3fs.S3FileSystem(key=s3fs_config['key'],
+                                                 secret=s3fs_config['secret'],
+                                                 endpoint_url=s3fs_config['endpoint_url'],
+                                                 config_kwargs={'connect_timeout': 10,
+                                                                'read_timeout': 900})
+            self.s3fs_bucket = s3fs_config['bucket_name']
+            self.s3fs_client.clear_multipart_uploads(self.s3fs_bucket)

     def _to_run_forever(self):
         archiving_done = False
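For context, a minimal sketch of the configuration this client setup expects: a `s3fs` block under the generic config with the keys read in the hunk above. All values below are placeholders, not the project's defaults.

import s3fs  # type: ignore

# Hypothetical generic-config excerpt, as a dict; key names follow the diff above.
s3fs_config = {
    'archive_on_s3fs': True,
    'key': '<access-key>',
    'secret': '<secret-key>',
    'endpoint_url': 'https://s3.example.org',   # e.g. a MinIO instance
    'bucket_name': 'lookyloo-archive',
}

client = s3fs.S3FileSystem(key=s3fs_config['key'],
                           secret=s3fs_config['secret'],
                           endpoint_url=s3fs_config['endpoint_url'],
                           # botocore timeouts in seconds; the read timeout is
                           # generous because archived captures can be large.
                           config_kwargs={'connect_timeout': 10, 'read_timeout': 900})
# Clean up multipart uploads left dangling by interrupted runs.
client.clear_multipart_uploads(s3fs_config['bucket_name'])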
@@ -65,12 +73,19 @@ class Archiver(AbstractManager):
         # This call takes a very long time on MinIO
         self._update_all_capture_indexes()

-    def _update_index(self, root_dir: Path) -> None:
+    def _update_index(self, root_dir: Path, *, s3fs: bool=False) -> None:
         current_index: Dict[str, str] = {}
-        if not any(os.scandir(root_dir)):
-            # the directory is empty, we can safely remove it
-            root_dir.rmdir()
-            return
+        if s3fs:
+            self.s3fs_client.invalidate_cache(str(root_dir))
+            all_s3fs_captures = self.s3fs_client.ls(str(root_dir), detail=False, refresh=True)
+            if not all_s3fs_captures:
+                self.s3fs_client.rmdir(str(root_dir))
+                return
+        else:
+            if not any(os.scandir(root_dir)):
+                # the directory is empty, we can safely remove it
+                root_dir.rmdir()
+                return

         self.logger.debug(f'Updating index for {root_dir}')
         index_file = root_dir / 'index'
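s3fs memoizes directory listings, so the s3fs branch above both invalidates the cached listing and passes refresh=True; otherwise ls() could return captures that were already moved or deleted. A minimal sketch, assuming `fs` is the S3FileSystem built in __init__ and a hypothetical 'lookyloo-archive/2023/06' prefix:

prefix = 'lookyloo-archive/2023/06'      # hypothetical root_dir on the bucket
fs.invalidate_cache(prefix)              # drop any memoized listing for this prefix
captures = fs.ls(prefix, detail=False, refresh=True)  # plain keys, re-read from S3
if not captures:
    fs.rmdir(prefix)  # empty prefix, mirrors root_dir.rmdir() in the local branch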
@@ -89,9 +104,16 @@ class Archiver(AbstractManager):

         curent_index_dirs = set(current_index.values())

-        with os.scandir(root_dir) as it:
-            new_captures = {existing_capture.name for existing_capture in it
-                            if (existing_capture.name not in curent_index_dirs) and existing_capture.is_dir()}
+        if s3fs:
+            new_captures = {existing_capture.rsplit('/', 1)[-1] for existing_capture in all_s3fs_captures
+                            if existing_capture.rsplit('/', 1)[-1]
+                            and (existing_capture.rsplit('/', 1)[-1] not in curent_index_dirs)
+                            and self.s3fs_client.is_dir(str(existing_capture))}
+        else:
+            with os.scandir(root_dir) as it:
+                new_captures = {existing_capture.name for existing_capture in it
+                                if (existing_capture.name not in curent_index_dirs)
+                                and existing_capture.is_dir()}

         if not new_captures:
             # No new captures, quitting
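Unlike os.scandir(), s3fs's ls() yields full keys rather than bare entry names, which is why the comprehension repeats rsplit('/', 1)[-1]. A small illustration with a hypothetical key:

key = 'lookyloo-archive/2023/06/20230601_120000_abcdef'  # hypothetical capture key
name = key.rsplit('/', 1)[-1]  # the os.DirEntry.name equivalent
assert name == '20230601_120000_abcdef'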
@@ -102,9 +124,6 @@ class Archiver(AbstractManager):

         for capture_dir_name in new_captures:
             capture_dir = root_dir / capture_dir_name
-            if not capture_dir.is_dir():
-                self.logger.warning(f'{capture_dir} is not a directory')
-                continue
             if not next(capture_dir.iterdir(), None):
                 self.logger.warning(f'{capture_dir} is empty, removing.')
                 capture_dir.rmdir()
@@ -116,20 +135,21 @@ class Archiver(AbstractManager):
                 continue
             with uuid_file.open() as _f:
                 uuid = _f.read().strip()
+
             try:
                 if not uuid:
                     self.logger.warning(f'{uuid_file} is empty')
                     shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
                     continue
                 if uuid in current_index:
                     self.logger.warning(f'Duplicate UUID ({uuid}) in {current_index[uuid]} and {uuid_file.parent.name}')
                     shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
                     continue
             except OSError as e:
                 self.logger.warning(f'Error when discarding capture {capture_dir}: {e}')
                 continue

             current_index[uuid] = uuid_file.parent.name

         if not current_index:
             # The directory has been archived. It is probably safe to unlink, but
@@ -162,8 +182,8 @@ class Archiver(AbstractManager):
         # Recent captures
         self.logger.info('Update recent indexes')
         # NOTE: the call below will check the existence of every path ending with `uuid`,
-        # it is extremely inneficient as we have many hundred of thusands of them
-        # and we only care about the rood directory (ex: 2023/06)
+        # it is extremely ineficient as we have many hundred of thusands of them
+        # and we only care about the root directory (ex: 2023/06)
         # directories_to_index = {capture_dir.parent.parent
         #                         for capture_dir in get_captures_dir().glob('*/*/*/uuid')}
         for directory_to_index in self._make_dirs_list(get_captures_dir()):
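The NOTE in this hunk is about avoiding a per-capture stat: globbing */*/*/uuid touches every capture's uuid file, while only the year/month directories (ex: 2023/06) are needed. A sketch of the cheaper two-level walk, using a hypothetical helper rather than the repo's _make_dirs_list:

from pathlib import Path

def month_dirs(captures_root: Path):
    # Hypothetical stand-in for _make_dirs_list: yield year/month directories
    # (e.g. 2023/06) without touching the uuid file of every capture below them.
    for year_dir in sorted(p for p in captures_root.iterdir() if p.is_dir()):
        yield from sorted(p for p in year_dir.iterdir() if p.is_dir())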
@@ -172,16 +192,13 @@ class Archiver(AbstractManager):
                 break
             self._update_index(directory_to_index)
         self.logger.info('Recent indexes updated')
-        if self.archive_on_s3fs:
-            self.logger.info('Not updating indexes as they are on a s3fs-fuse mount.')
-            return
         # Archived captures
         self.logger.info('Update archives indexes')
         for directory_to_index in self._make_dirs_list(self.archived_captures_dir):
             if self.shutdown_requested():
                 self.logger.warning('Shutdown requested, breaking.')
                 break
-            self._update_index(directory_to_index)
+            self._update_index(directory_to_index, s3fs=self.archive_on_s3fs)
         self.logger.info('Archived indexes updated')

     def _archive(self):
@@ -35,7 +35,7 @@ start_website = "bin.start_website:main"


 [tool.poetry.dependencies]
-python = ">=3.8,<3.13"
+python = ">=3.8,<3.12"
 requests = "^2.31.0"
 flask = "^2.3.3"
 gunicorn = "^21.2.0"
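The tightened constraint matches the CI matrices above: Python 3.12 leaves the supported range. Read as a version check, the Poetry range means:

import sys

# ">=3.8,<3.12" admits CPython 3.8 through 3.11 only.
assert (3, 8) <= sys.version_info[:2] < (3, 12), 'Python 3.12 is not supported at this commit'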