2021-08-20 17:46:22 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import csv
|
2022-07-12 18:44:33 +02:00
|
|
|
import gzip
|
2021-08-20 17:46:22 +02:00
|
|
|
import logging
|
2022-11-23 15:54:22 +01:00
|
|
|
import logging.config
|
2023-08-06 21:34:20 +02:00
|
|
|
import os
|
2022-07-12 18:44:33 +02:00
|
|
|
import shutil
|
|
|
|
|
2021-09-07 12:59:31 +02:00
|
|
|
from collections import defaultdict
|
2022-05-23 00:15:52 +02:00
|
|
|
from collections.abc import Mapping
|
2023-08-04 14:02:45 +02:00
|
|
|
from datetime import datetime, timedelta, date
|
2021-08-20 17:46:22 +02:00
|
|
|
from pathlib import Path
|
2023-04-05 16:23:46 +02:00
|
|
|
from typing import Dict, List, Optional
|
2021-08-20 17:46:22 +02:00
|
|
|
|
2021-08-24 17:10:14 +02:00
|
|
|
from redis import Redis
|
|
|
|
|
2021-10-18 13:06:43 +02:00
|
|
|
from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path
|
|
|
|
from lookyloo.helpers import get_captures_dir
|
2021-08-20 17:46:22 +02:00
|
|
|
|
2022-11-23 15:54:22 +01:00
|
|
|
# Apply the project's 'logging' configuration (as returned by get_config) when
# this module is imported.
logging.config.dictConfig(get_config('logging'))
|
2021-08-20 17:46:22 +02:00
|
|
|
|
|
|
|
|
|
|
|
class Archiver(AbstractManager):
    """Background manager archiving old captures and maintaining the capture lookup indexes.

    Every round (see `_to_run_forever`):
      1. captures belonging to months older than the configured archive interval are
         moved from the captures directory to `archived_captures` (`_archive`),
      2. the per-month `index` files (CSV rows of `uuid,dirname`) are rebuilt
         (`_update_all_capture_indexes`),
      3. the indexes are loaded in the `lookup_dirs` and `lookup_dirs_archived`
         redis hashes (`_load_indexes`).
    """

    def __init__(self, loglevel: Optional[int]=None):
        """Connect to the cache, make sure the archive directory exists, load the indexes.

        :param loglevel: forwarded as-is to the parent class constructor.
        """
        super().__init__(loglevel)
        self.script_name = 'archiver'
        self.redis = Redis(unix_socket_path=get_socket_path('cache'))

        # make sure archived captures dir exists
        self.archived_captures_dir = get_homedir() / 'archived_captures'
        self.archived_captures_dir.mkdir(parents=True, exist_ok=True)

        self._load_indexes()

    def _to_run_forever(self):
        """Run one full round: archive, update the index files, reload them in redis."""
        self._archive()
        self._update_all_capture_indexes()
        self._load_indexes()
        # The HARs are supposedly all compressed so this call shouldn't be required
        # unless you're processing old captures for the first time.
        # self._compress_hars()

    @staticmethod
    def _gzip_har(har: Path) -> None:
        """Compress `har` into `<har>.gz` (same directory) and remove the uncompressed file."""
        with har.open('rb') as f_in:
            with gzip.open(f'{har}.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        har.unlink()

    def _discard_capture(self, capture_dir: Path) -> None:
        """Move an unusable capture directory inside `discarded_captures`."""
        discarded_dir = get_homedir() / 'discarded_captures'
        # shutil.move only moves the source *inside* the destination if the
        # destination is an existing directory. If it doesn't exist, the capture
        # itself would be renamed to `discarded_captures`.
        discarded_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(capture_dir), str(discarded_dir))

    def _update_index(self, root_dir: Path) -> None:
        """(Re)build the `index` file of one month directory.

        The index is a CSV file with one `uuid,dirname` row per capture directory
        located in `root_dir`.

        * An empty `root_dir` is removed.
        * Rows of the existing index that are malformed or point to a directory
          that no longer exists are dropped; an unreadable index is recreated.
        * Entries of `root_dir` that aren't indexed yet are added if they are
          non-empty directories with a non-empty `uuid` file whose content isn't
          already indexed. Empty directories are removed, the other invalid ones
          are moved to `discarded_captures`.
        * If nothing is left to index, `root_dir` itself is moved to
          `discarded_captures`.

        :param root_dir: the month directory to index (ex: <captures dir>/2023/06)
        """
        if not os.listdir(root_dir):
            # the directory is empty, we can safely remove it
            root_dir.rmdir()
            return

        self.logger.debug(f'Updating index for {root_dir}')
        index_file = root_dir / 'index'
        # Every entry of the directory but the index itself.
        existing_captures_names = {existing_capture.name for existing_capture in root_dir.iterdir()
                                   if existing_capture.name != 'index'}

        current_index: Dict[str, str] = {}
        if index_file.exists():
            try:
                with index_file.open('r') as _f:
                    # Only keep the complete rows pointing to an existing entry.
                    current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)
                                     if uuid
                                     and dirname
                                     and dirname in existing_captures_names}
            except Exception as e:
                # the index file is broken, it will be recreated.
                self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}')
            if not current_index:
                index_file.unlink()

        indexed_names = set(current_index.values())
        if indexed_names == existing_captures_names:
            # No new captures, quitting
            self.logger.debug(f'No new captures in {root_dir}.')
            return

        new_captures = sorted(existing_captures_names - indexed_names, reverse=True)
        self.logger.info(f'{len(new_captures)} new captures in {root_dir}.')

        for capture_dir_name in new_captures:
            capture_dir = root_dir / capture_dir_name
            if not capture_dir.is_dir():
                self.logger.warning(f'{capture_dir} is not a directory')
                continue
            if not next(capture_dir.iterdir(), None):
                self.logger.warning(f'{capture_dir} is empty, removing.')
                capture_dir.rmdir()
                continue
            uuid_file = capture_dir / 'uuid'
            if not uuid_file.exists():
                self.logger.warning(f'No UUID file in {capture_dir}.')
                self._discard_capture(capture_dir)
                continue
            with uuid_file.open() as _f:
                uuid = _f.read().strip()
            if not uuid:
                self.logger.warning(f'{uuid_file} is empty')
                self._discard_capture(capture_dir)
                continue
            if uuid in current_index:
                self.logger.warning(f'Duplicate UUID ({uuid}) in {current_index[uuid]} and {uuid_file.parent.name}')
                self._discard_capture(capture_dir)
                continue
            current_index[uuid] = uuid_file.parent.name

        if not current_index:
            # The directory has been archived. It is probably safe to unlink, but
            # if it's not, we will lose a whole bunch of captures. Moving instead for safety.
            # The destination is built from the *names* of the last path components:
            # joining `root_dir.parent` directly discards everything on its left when
            # that path is absolute, making the destination `root_dir` itself.
            discarded_root = (get_homedir() / 'discarded_captures'
                              / root_dir.parent.parent.name / root_dir.parent.name / root_dir.name)
            # The parent directories must exist for the move to succeed.
            discarded_root.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(root_dir), str(discarded_root))
            return

        with index_file.open('w') as _f:
            index_writer = csv.writer(_f)
            for uuid, dirname in current_index.items():
                index_writer.writerow([uuid, dirname])

    def _make_dirs_list(self, root_dir: Path) -> List[Path]:
        """List the existing `<root_dir>/<year>/<month>` directories, most recent first.

        Starts at the current year and walks backwards; stops at the first year
        without a directory, so the years before a gap are not listed.

        :param root_dir: directory containing one sub-directory per year.
        """
        directories = []
        year_now = date.today().year
        while True:
            year_dir = root_dir / str(year_now)
            if not year_dir.exists():
                # if we do not have a directory with this year, quit the loop
                break
            for month in range(12, 0, -1):
                month_dir = year_dir / f'{month:02}'
                if month_dir.exists():
                    directories.append(month_dir)
            year_now -= 1
        return directories

    def _update_all_capture_indexes(self):
        '''Run that after the captures are in the proper directories'''
        # Recent captures
        self.logger.info('Update recent indexes')
        # NOTE: the call below will check the existence of every path ending with `uuid`,
        #       it is extremely inefficient as we have many hundreds of thousands of them
        #       and we only care about the root directory (ex: 2023/06)
        # directories_to_index = {capture_dir.parent.parent
        #                         for capture_dir in get_captures_dir().glob('*/*/*/uuid')}
        for directory_to_index in self._make_dirs_list(get_captures_dir()):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break
            self._update_index(directory_to_index)
        self.logger.info('Recent indexes updated')

        # Archived captures
        self.logger.info('Update archives indexes')
        for directory_to_index in self._make_dirs_list(self.archived_captures_dir):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break
            self._update_index(directory_to_index)
        self.logger.info('Archived indexes updated')

    def _archive(self):
        """Move the captures older than the archive interval to `archived_captures`.

        The cut date is the first day of the month of (now - 'archive' days, from the
        'generic' config): only captures of the months strictly before it are
        archived. At most 999 captures per month are moved in one call, the
        remaining ones are left for a later call.
        """
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = (datetime.now() - archive_interval).date()
        cut_time = cut_time.replace(day=1)

        # Format:
        # { 2020: { 12: [capture_directory, ...] } }
        to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
        # In order to avoid scanning the complete directory on each run, we check if year and month are
        # older than the cut time.
        for index in get_captures_dir().glob('*/*/index'):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break
            month = int(index.parent.name)
            year = int(index.parent.parent.name)
            if date(year, month, 1) >= cut_time:
                continue

            for capture_uuid in index.parent.glob('*/uuid'):
                # The capture directory is named after its timestamp, with or without microseconds.
                try:
                    timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
                except ValueError:
                    timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S')
                if timestamp.date() >= cut_time:
                    continue
                to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent)
                self.logger.info(f'Archiving {capture_uuid.parent}.')

        if not to_archive:
            self.logger.info('Nothing to archive.')
            return

        # The deletions of the redis keys named after the capture paths are
        # queued and sent at once after the loops.
        p = self.redis.pipeline()
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                dest_dir.mkdir(parents=True, exist_ok=True)
                capture_breakpoint = 1000
                for capture_path in captures:
                    capture_breakpoint -= 1
                    if capture_breakpoint <= 0:
                        # Break and restart later
                        self.logger.info(f'Archived many captures in {year}-{month}, will keep going later.')
                        break
                    elif capture_breakpoint % 100:
                        # Just check if we requested a shutdown.
                        # NOTE(review): this condition is true for every value that is
                        # *not* a multiple of 100, so the check runs on almost every
                        # iteration. Kept as-is; confirm whether `== 0` was intended.
                        if self.shutdown_requested():
                            self.logger.warning('Shutdown requested, breaking.')
                            break
                    p.delete(str(capture_path))
                    # If the HAR isn't archived yet, archive it before copy
                    for har in capture_path.glob('*.har'):
                        self._gzip_har(har)
                    # The pickled trees are not archived.
                    (capture_path / 'tree.pickle').unlink(missing_ok=True)
                    (capture_path / 'tree.pickle.gz').unlink(missing_ok=True)
                    shutil.move(str(capture_path), str(dest_dir))
        p.execute()

        self.logger.info('Archiving done.')

    def _compress_hars(self):
        """This method is very slow (it checks every single capture for non-compressed HARs)
        The new approach is to compress the har of every capture by default so this shouldn't be
        needed anymore. Keeping it here just for reference, or to process old archives that contain
        non-gzipped HARs.
        """
        self.logger.info('Compressing archived captures')
        for index in self.archived_captures_dir.glob('*/*/index'):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break
            with index.open('r') as _f:
                for uuid, dirname in csv.reader(_f):
                    for har in (index.parent / dirname).glob('*.har'):
                        self._gzip_har(har)
        self.logger.info('Archived captures compressed')

    def _load_indexes(self):
        """Load the content of the `index` files in redis.

        * Recent captures: `uuid -> capture directory` goes in the `lookup_dirs` hash
          for every indexed directory that exists.
        * Archived captures: the UUIDs not yet in the `lookup_dirs_archived` hash are
          added to it if their directory exists, and all the UUIDs of that index are
          removed from `lookup_dirs`.

        An index without any usable entry is deleted.
        """
        # Initialize recent captures
        for index in get_captures_dir().glob('*/*/index'):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break

            self.logger.info(f'Loading {index}')
            with index.open('r') as _f:
                recent_uuids: Mapping = {uuid: str(index.parent / dirname)
                                         for uuid, dirname in csv.reader(_f)
                                         if (index.parent / dirname).exists()}
            if recent_uuids:
                self.logger.info(f'{len(recent_uuids)} captures in directory.')
                self.redis.hset('lookup_dirs', mapping=recent_uuids)
            else:
                index.unlink()
        self.logger.info('Recent indexes loaded')

        already_archived_uuids = {k.decode() for k in self.redis.hkeys('lookup_dirs_archived')}
        self.logger.info(f'Already have {len(already_archived_uuids)} UUIDs archived')
        # Initialize archives
        for index in sorted(self.archived_captures_dir.glob('*/*/index'), reverse=True):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break
            self.logger.debug(f'Loading {index}')
            with index.open('r') as _f:
                archived_uuids: Mapping = {uuid: index.parent / dirname
                                           for uuid, dirname in csv.reader(_f)}
            if archived_uuids:
                self.logger.debug(f'{len(archived_uuids)} captures in directory.')
                new_uuids = set(archived_uuids.keys()) - already_archived_uuids
                if not new_uuids:
                    self.logger.debug('No new archived UUID to check.')
                    continue

                self.logger.info(f'Loading {index}, {len(archived_uuids)} captures in directory, {len(new_uuids)} archived UUID to check.')
                # NOTE: Only check if the directory exists if the UUID isn't in the cache.
                to_cache = {uuid: str(dirname)
                            for uuid, dirname in archived_uuids.items()
                            if uuid in new_uuids and dirname.exists()}
                # hset raises an exception if the mapping is empty, which happens
                # when none of the new UUIDs points to an existing directory.
                if to_cache:
                    self.redis.hset('lookup_dirs_archived', mapping=to_cache)
                self.redis.hdel('lookup_dirs', *archived_uuids.keys())
            else:
                index.unlink()
        self.logger.info('Archived indexes loaded')
|
2021-08-23 15:14:08 +02:00
|
|
|
|
2021-08-20 17:46:22 +02:00
|
|
|
|
|
|
|
def main():
    """Instantiate the Archiver and call its run() with sleep_in_sec=3600
    (presumably the pause between two rounds, in seconds).
    """
    archiver = Archiver()
    archiver.run(sleep_in_sec=3600)
|
2021-08-20 17:46:22 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Only start the archiver when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|