lookyloo/bin/archiver.py

#!/usr/bin/env python3

import csv
import gzip
import logging
import logging.config
import shutil

from collections import defaultdict
from collections.abc import Mapping
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional

from redis import Redis

from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path
from lookyloo.helpers import get_captures_dir

# Configure the logging module, at import time, with the dictionary returned by get_config('logging').
logging.config.dictConfig(get_config('logging'))


class Archiver(AbstractManager):
    '''Background job archiving old captures and maintaining the capture indexes.

    One cycle (see `_to_run_forever`) moves the captures older than the configured
    interval to `archived_captures_dir`, rebuilds the `index` CSV file of every
    directory holding captures, loads these indexes in the redis lookup hashes,
    and gzips the HAR files of the archived captures.
    '''

    def __init__(self, loglevel: Optional[int]=None):
        '''Connect to the cache redis instance, make sure the archive directory
        exists, and load the indexes currently on disk.

        :param loglevel: passed as-is to AbstractManager.
        '''
        super().__init__(loglevel)
        self.script_name = 'archiver'
        self.redis = Redis(unix_socket_path=get_socket_path('cache'))

        # make sure archived captures dir exists
        self.archived_captures_dir = get_homedir() / 'archived_captures'
        self.archived_captures_dir.mkdir(parents=True, exist_ok=True)

        self._load_indexes()

    def _to_run_forever(self):
        '''Run one archiver cycle.

        The order matters: the indexes are rebuilt after the captures were moved,
        loaded after they were rebuilt, and the compression relies on the
        indexes of the archived captures.
        '''
        self._archive()
        self._update_all_capture_indexes()
        self._load_indexes()
        self._compress_hars()

    def _update_index(self, root_dir: Path) -> None:
        '''(Re)build the `index` CSV file (rows: uuid, directory name) of `root_dir`.

        The entries of the existing index pointing to a directory still present
        in `root_dir` are kept. Every other sub-directory containing a `uuid`
        file is added by reading that file. If there is nothing to index at all,
        `root_dir` is moved to `discarded_captures` instead of being removed.

        :param root_dir: directory directly containing the capture directories.
        '''
        current_index: Dict[str, str] = {}

        index_file = root_dir / 'index'
        if index_file.exists():
            # Skip index if the directory has been archived.
            # iterdir() returns a one-shot generator: it has to be materialized,
            # otherwise each membership test consumes a part of it and valid
            # entries get dropped depending on the order of the listing.
            existing_captures = set(root_dir.iterdir())
            try:
                with index_file.open('r', newline='') as _f:
                    current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)
                                     if (root_dir / dirname) in existing_captures}
            except Exception as e:
                # the index file is broken, it will be recreated.
                self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}')
            if not current_index:
                index_file.unlink()

        # Directories already in the index file, no need to read their uuid file.
        # glob() yields each directory once, so the set can be built upfront.
        indexed_dirnames = set(current_index.values())
        for uuid_file in root_dir.glob('*/uuid'):
            if uuid_file.parent.name in indexed_dirnames:
                continue
            with uuid_file.open() as _f:
                current_index[_f.read().strip()] = uuid_file.parent.name

        if not current_index:
            # The directory has been archived. It is probably safe to unlink, but
            # if it's not, we will lose a whole bunch of captures. Moving instead for safety.
            discarded_dir = get_homedir() / 'discarded_captures'
            # rename() fails if the parent of the destination is missing.
            discarded_dir.mkdir(parents=True, exist_ok=True)
            # NOTE(review): only the last component of root_dir is kept, two
            # discarded directories with the same name would collide - confirm
            # whether that can happen in practice.
            root_dir.rename(discarded_dir / root_dir.name)
            return

        # newline='' is what the csv module expects from the files it writes to.
        with index_file.open('w', newline='') as _f:
            csv.writer(_f).writerows(current_index.items())

    def _update_indexes_under(self, captures_dir: Path) -> None:
        '''Update the index of every directory located two levels above a `uuid`
        file found (recursively) in `captures_dir`.'''
        directories_to_index = {uuid_file.parent.parent for uuid_file in captures_dir.rglob('uuid')}
        for directory_to_index in directories_to_index:
            self.logger.debug(f'Updating index for {directory_to_index}')
            self._update_index(directory_to_index)

    def _update_all_capture_indexes(self):
        '''Run that after the captures are in the proper directories'''
        # Recent captures
        self.logger.info('Update recent indexes')
        self._update_indexes_under(get_captures_dir())
        self.logger.info('Recent indexes updated')

        # Archived captures
        self.logger.info('Update archives indexes')
        self._update_indexes_under(self.archived_captures_dir)
        self.logger.info('Archived indexes updated')

    @staticmethod
    def _timestamp_from_dirname(dirname: str) -> Optional[datetime]:
        '''Parse the name of a capture directory (ISO-like timestamp, with or
        without microseconds). Returns None if the name matches neither format.'''
        for time_format in ('%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M:%S'):
            try:
                return datetime.strptime(dirname, time_format)
            except ValueError:
                continue
        return None

    def _archive(self):
        '''Move the old captures from the captures directory to the archive.

        A capture is archived when the date in its directory name is before the
        first day of the month of (now - configured archive interval). It is
        moved to `archived_captures_dir/<year>/<month>/`, its tree pickles are
        removed, and the redis key named after its former path is deleted.
        '''
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        # The cut-off is aligned on the first day of the month.
        cut_time = (datetime.now() - archive_interval).date().replace(day=1)

        # Format:
        # { 2020: { 12: [capture_directory] } }
        to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
        for capture_uuid in get_captures_dir().rglob('uuid'):
            capture_dir = capture_uuid.parent
            timestamp = self._timestamp_from_dirname(capture_dir.name)
            if timestamp is None:
                # One unexpected directory must not prevent archiving all the others.
                self.logger.warning(f'Unable to get a timestamp from the directory name, skipping {capture_dir}.')
                continue
            if timestamp.date() >= cut_time:
                continue
            to_archive[timestamp.year][timestamp.month].append(capture_dir)
            self.logger.info(f'Archiving {capture_dir}.')

        if not to_archive:
            self.logger.info('Nothing to archive.')
            return

        # The deletions are queued and sent to redis in one go, after the moves.
        p = self.redis.pipeline()
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                dest_dir.mkdir(parents=True, exist_ok=True)
                for capture_path in captures:
                    p.delete(str(capture_path))
                    (capture_path / 'tree.pickle').unlink(missing_ok=True)
                    (capture_path / 'tree.pickle.gz').unlink(missing_ok=True)
                    capture_path.rename(dest_dir / capture_path.name)
        p.execute()

        self.logger.info('Archiving done.')

    def _compress_hars(self):
        '''Gzip every `*.har` file of the archived captures listed in an index.

        Each HAR file is written to `<name>.har.gz` next to it and removed once
        the compressed copy is complete. A broken index is logged and skipped.
        '''
        self.logger.info('Compressing archived captures')
        for index in self.archived_captures_dir.rglob('index'):
            try:
                # Read the whole index first: the file is not kept open during
                # the compression, and a broken row is detected before any work.
                with index.open('r', newline='') as _f:
                    dirnames = [dirname for _, dirname in csv.reader(_f)]
            except (csv.Error, ValueError) as e:
                self.logger.warning(f'Index {index} broken, skipping compression: {e}')
                continue
            for dirname in dirnames:
                for har in (index.parent / dirname).rglob('*.har'):
                    if not har.exists():
                        continue
                    with har.open('rb') as f_in, gzip.open(f'{har}.gz', 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                    har.unlink()
        self.logger.info('Archived captures compressed')

    def _existing_index_entries(self, index: Path) -> Optional[Mapping]:
        '''Read an index file and return {uuid: full path of the capture directory},
        limited to the directories that exist on disk.

        Returns None (after logging a warning) if the index cannot be parsed, so
        the caller can tell a broken index apart from an index without any valid
        entry.
        '''
        try:
            with index.open('r', newline='') as _f:
                return {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)
                        if (index.parent / dirname).exists()}
        except (csv.Error, ValueError) as e:
            self.logger.warning(f'Index {index} broken, skipping it: {e}')
            return None

    def _load_indexes(self):
        '''Load the content of every index file in the redis lookup hashes.

        Recent captures go to `lookup_dirs`, archived ones to
        `lookup_dirs_archived` (and are removed from `lookup_dirs`). An index
        without any entry pointing to an existing directory is removed. A broken
        index is left in place: it is recreated by `_update_index`.
        '''
        # Initialize recent captures
        for index in get_captures_dir().rglob('index'):
            recent_uuids = self._existing_index_entries(index)
            if recent_uuids is None:
                continue
            if recent_uuids:
                self.redis.hset('lookup_dirs', mapping=recent_uuids)
            else:
                index.unlink()
        self.logger.info('Recent indexes loaded')

        # Initialize archives
        for index in self.archived_captures_dir.rglob('index'):
            archived_uuids = self._existing_index_entries(index)
            if archived_uuids is None:
                continue
            if archived_uuids:
                self.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
                # An archived capture must not be listed as a recent one anymore.
                self.redis.hdel('lookup_dirs', *archived_uuids.keys())
            else:
                index.unlink()
        self.logger.info('Archived indexes loaded')


def main():
    '''Script entry point: instantiate the Archiver and call its run() method
    with sleep_in_sec=3600.'''
    archiver = Archiver()
    archiver.run(sleep_in_sec=3600)


# Only start the archiver when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`#!/usr/bin/env python3`

			`import csv`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`import gzip`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`import logging`
new: Logging config in file 2022-11-23 15:54:22 +01:00			`import logging.config`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`import shutil`

chg: reorder imports 2021-09-07 12:59:31 +02:00			`from collections import defaultdict`
chg: sunday cleanup 2022-05-23 00:15:52 +02:00			`from collections.abc import Mapping`
chg: reorder imports 2021-09-07 12:59:31 +02:00			`from datetime import datetime, timedelta`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`from pathlib import Path`
chg: Normalize logging on the config file settings 2023-04-05 16:23:46 +02:00			`from typing import Dict, List, Optional`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00			`from redis import Redis`

chg: use template 2021-10-18 13:06:43 +02:00			`from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path`
			`from lookyloo.helpers import get_captures_dir`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
new: Logging config in file 2022-11-23 15:54:22 +01:00			`logging.config.dictConfig(get_config('logging'))`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00

			`class Archiver(AbstractManager):`

chg: Normalize logging on the config file settings 2023-04-05 16:23:46 +02:00			`def __init__(self, loglevel: Optional[int]=None):`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`super().__init__(loglevel)`
			`self.script_name = 'archiver'`
fix: no decoding in archiver, catch exception when requesting hashes on broken capture 2023-03-16 14:47:24 +01:00			`self.redis = Redis(unix_socket_path=get_socket_path('cache'))`
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00
			`# make sure archived captures dir exists`
chg: cleanup in the mail lookyloo class 2021-08-24 18:32:54 +02:00			`self.archived_captures_dir = get_homedir() / 'archived_captures'`
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00			`self.archived_captures_dir.mkdir(parents=True, exist_ok=True)`

chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`self._load_indexes()`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
			`def _to_run_forever(self):`
			`self._archive()`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`self._update_all_capture_indexes()`
			`self._load_indexes()`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`self._compress_hars()`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00
			`def _update_index(self, root_dir: Path) -> None:`
chg: better handling of broken indexes in archiver 2021-11-26 18:36:35 +01:00			`current_index: Dict[str, str] = {}`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00
			`index_file = root_dir / 'index'`
			`if index_file.exists():`
			`# Skip index if the directory has been archived.`
			`existing_captures = index_file.parent.iterdir()`
chg: better handling of broken indexes in archiver 2021-11-26 18:36:35 +01:00			`try:`
			`with index_file.open('r') as _f:`
			`current_index = {uuid: dirname for uuid, dirname in csv.reader(_f) if (index_file.parent / dirname) in existing_captures}`
chg: Improve logging for archiver. 2022-09-23 14:32:42 +02:00			`except Exception as e:`
chg: better handling of broken indexes in archiver 2021-11-26 18:36:35 +01:00			`# the index file is broken, it will be recreated.`
chg: Improve logging for archiver. 2022-09-23 14:32:42 +02:00			`self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}')`
chg: better handling of broken indexes in archiver 2021-11-26 18:36:35 +01:00			`pass`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`if not current_index:`
			`index_file.unlink()`

			`for uuid_file in root_dir.glob('*/uuid'):`
			`if uuid_file.parent.name in current_index.values():`
			`# The path is already in the index file, no need to read the uuid file`
			`continue`
			`with uuid_file.open() as _f:`
			`current_index[_f.read().strip()] = uuid_file.parent.name`

			`if not current_index:`
chg: out of safety, do not remove a capture dir. 2021-08-30 12:54:17 +02:00			`# The directory has been archived. It is probably safe to unlink, but`
			`# if it's not, we will lose a whole buch of captures. Moving instead for safety.`
			`root_dir.rename(get_homedir() / 'discarded_captures' / root_dir.name)`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`return`

			`with index_file.open('w') as _f:`
			`index_writer = csv.writer(_f)`
			`for uuid, dirname in current_index.items():`
			`index_writer.writerow([uuid, dirname])`

			`def _update_all_capture_indexes(self):`
			`'''Run that after the captures are in the proper directories'''`
			`# Recent captures`
chg: Avoid captures without url(s) or document 2022-09-27 11:33:36 +02:00			`self.logger.info('Update recent indexes')`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`directories_to_index = {capture_dir.parent.parent for capture_dir in get_captures_dir().rglob('uuid')}`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`for directory_to_index in directories_to_index:`
chg: Avoid captures without url(s) or document 2022-09-27 11:33:36 +02:00			`self.logger.debug(f'Updating index for {directory_to_index}')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`self._update_index(directory_to_index)`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Recent indexes updated')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00
			`# Archived captures`
chg: Avoid captures without url(s) or document 2022-09-27 11:33:36 +02:00			`self.logger.info('Update archives indexes')`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`directories_to_index = {capture_dir.parent.parent for capture_dir in self.archived_captures_dir.rglob('uuid')}`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`for directory_to_index in directories_to_index:`
chg: Avoid captures without url(s) or document 2022-09-27 11:33:36 +02:00			`self.logger.debug(f'Updating index for {directory_to_index}')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`self._update_index(directory_to_index)`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Archived indexes updated')`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
			`def _archive(self):`
			`archive_interval = timedelta(days=get_config('generic', 'archive'))`
chg: Make the cut-off date for archiving the 1st of the month 2021-08-23 15:36:59 +02:00			`cut_time = (datetime.now() - archive_interval).date()`
fix: properly match cut time 2021-08-23 15:51:06 +02:00			`cut_time = cut_time.replace(day=1)`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
			`# Format:`
			`# { 2020: { 12: [(directory, uuid)] } }`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`for capture_uuid in get_captures_dir().rglob('uuid'):`
fix: Avoid exception if microsec is missing. 2023-03-12 19:24:10 +01:00			`try:`
			`timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')`
			`except ValueError:`
			`timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S')`
fix: properly match cut time 2021-08-23 15:51:06 +02:00			`if timestamp.date() >= cut_time:`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`continue`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent)`
chg: Improve storage, support both modes. 2021-08-26 15:49:19 +02:00			`self.logger.info(f'Archiving {capture_uuid.parent}.')`
chg: Better use of cache, sanity checks 2021-08-23 12:17:44 +02:00
			`if not to_archive:`
			`self.logger.info('Nothing to archive.')`
			`return`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`p = self.redis.pipeline()`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`for year, month_captures in to_archive.items():`
			`for month, captures in month_captures.items():`
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00			`dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`dest_dir.mkdir(parents=True, exist_ok=True)`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`for capture_path in captures:`
			`p.delete(str(capture_path))`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`(capture_path / 'tree.pickle').unlink(missing_ok=True)`
fix: Cleanup prints, improve archiver. 2023-03-16 12:28:28 +01:00			`(capture_path / 'tree.pickle.gz').unlink(missing_ok=True)`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`capture_path.rename(dest_dir / capture_path.name)`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`p.execute()`

chg: Cleanup when dir is moved, digit months on 2 values 2021-08-23 14:53:19 +02:00			`self.logger.info('Archiving done.')`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`def _compress_hars(self):`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Compressing archived captures')`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`for index in self.archived_captures_dir.rglob('index'):`
			`with index.open('r') as _f:`
			`for uuid, dirname in csv.reader(_f):`
fix: Match compressed HAR as valid for rebuild 2022-09-28 11:23:44 +02:00			`for har in (index.parent / dirname).rglob('*.har'):`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`if not har.exists():`
			`continue`
			`with har.open('rb') as f_in:`
			`with gzip.open(f'{har}.gz', 'wb') as f_out:`
			`shutil.copyfileobj(f_in, f_out)`
			`har.unlink()`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Archived captures compressed')`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`def _load_indexes(self):`
			`# Initialize archives`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`for index in get_captures_dir().rglob('index'):`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`with index.open('r') as _f:`
chg: sunday cleanup 2022-05-23 00:15:52 +02:00			`recent_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}`
fix: unlink indexes pointing to unknown directories 2021-08-30 14:45:44 +02:00			`if recent_uuids:`
chg: sunday cleanup 2022-05-23 00:15:52 +02:00			`self.redis.hset('lookup_dirs', mapping=recent_uuids)`
fix: unlink indexes pointing to unknown directories 2021-08-30 14:45:44 +02:00			`else:`
			`index.unlink()`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Recent indexes loaded')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00			`# Initialize archives`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`for index in self.archived_captures_dir.rglob('index'):`
chg: Improve storage, support both modes. 2021-08-26 15:49:19 +02:00			`with index.open('r') as _f:`
chg: sunday cleanup 2022-05-23 00:15:52 +02:00			`archived_uuids: Mapping = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f) if (index.parent / dirname).exists()}`
fix: unlink indexes pointing to unknown directories 2021-08-30 14:45:44 +02:00			`if archived_uuids:`
chg: sunday cleanup 2022-05-23 00:15:52 +02:00			`self.redis.hset('lookup_dirs_archived', mapping=archived_uuids)`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.redis.hdel('lookup_dirs', *archived_uuids.keys())`
fix: unlink indexes pointing to unknown directories 2021-08-30 14:45:44 +02:00			`else:`
			`index.unlink()`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Archived indexes loaded')`
chg: Force init the archived indexes 2021-08-23 15:14:08 +02:00
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
			`def main():`
			`a = Archiver()`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`a.run(sleep_in_sec=3600)`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00

			`if __name__ == '__main__':`
			`main()`