lookyloo/bin/archiver.py

#!/usr/bin/env python3

import csv
import gzip
import logging
import logging.config
import os
import shutil

from collections import defaultdict
from collections.abc import Mapping
from datetime import datetime, timedelta, date
from pathlib import Path
from typing import Dict, List, Optional

from redis import Redis

from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path
from lookyloo.helpers import get_captures_dir

# Apply the 'logging' section of the configuration as soon as the module is imported.
logging.config.dictConfig(get_config('logging'))


class Archiver(AbstractManager):

    def __init__(self, loglevel: Optional[int]=None):
        """Prepare the Redis handle and the archive root, then load the indexes.

        :param loglevel: forwarded unchanged to the parent manager.
        """
        super().__init__(loglevel)
        self.script_name = 'archiver'

        cache_socket = get_socket_path('cache')
        self.redis = Redis(unix_socket_path=cache_socket)

        # The archive root has to exist before anything is moved into it.
        archive_root = get_homedir() / 'archived_captures'
        self.archived_captures_dir = archive_root
        archive_root.mkdir(parents=True, exist_ok=True)

        self._load_indexes()

    def _to_run_forever(self):
        self._archive()
        self._update_all_capture_indexes()
        self._load_indexes()
        # The HARs are supposedly all compressed so this call shouldn't be required
        # unless you're processing old captures for the first time.
        # self._compress_hars()

    def _update_index(self, root_dir: Path) -> None:
        current_index: Dict[str, str] = {}
        if not os.listdir(root_dir):
            # the directory is empty, we can safely remove it
            root_dir.rmdir()
            return

        self.logger.debug(f'Updating index for {root_dir}')
        index_file = root_dir / 'index'
        existing_captures_names = {existing_capture.name for existing_capture in index_file.parent.iterdir()
                                   if existing_capture.name != 'index'}
        if index_file.exists():
            # Skip index if the directory has been archived.
            try:
                with index_file.open('r') as _f:
                    current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)
                                     if uuid
                                     and dirname
                                     and dirname in existing_captures_names}
            except Exception as e:
                # the index file is broken, it will be recreated.
                self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}')
                pass
            if not current_index:
                index_file.unlink()

        if set(current_index.values()) == existing_captures_names:
            # No new captures, quitting
            self.logger.debug(f'No new captures in {root_dir}.')
            return

        new_captures = sorted(existing_captures_names - set(current_index.values()), reverse=True)
        self.logger.info(f'{len(new_captures)} new captures in {root_dir}.')

        for capture_dir_name in new_captures:
            capture_dir = root_dir / capture_dir_name
            if not capture_dir.is_dir():
                self.logger.warning(f'{capture_dir} is not a directory')
                continue
            if not next(capture_dir.iterdir(), None):
                self.logger.warning(f'{capture_dir} is empty, removing.')
                capture_dir.rmdir()
                continue
            uuid_file = capture_dir / 'uuid'
            if not uuid_file.exists():
                self.logger.warning(f'No UUID file in {capture_dir}.')
                shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
                continue
            with uuid_file.open() as _f:
                uuid = _f.read().strip()
                if not uuid:
                    self.logger.warning(f'{uuid_file} is empty')
                    shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
                    continue
                if uuid in current_index:
                    self.logger.warning(f'Duplicate UUID ({uuid}) in {current_index[uuid]} and {uuid_file.parent.name}')
                    shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
                    continue
                current_index[uuid] = uuid_file.parent.name

        if not current_index:
            # The directory has been archived. It is probably safe to unlink, but
            # if it's not, we will lose a whole buch of captures. Moving instead for safety.
            shutil.move(str(root_dir), str(get_homedir() / 'discarded_captures' / root_dir.parent / root_dir.name))
            return

        with index_file.open('w') as _f:
            index_writer = csv.writer(_f)
            for uuid, dirname in current_index.items():
                index_writer.writerow([uuid, dirname])

    def _make_dirs_list(self, root_dir: Path) -> List[Path]:
        directories = []
        year_now = date.today().year
        while True:
            year_dir = root_dir / str(year_now)
            if not year_dir.exists():
                # if we do not have a directory with this year, quit the loop
                break
            for month in range(12, 0, -1):
                month_dir = year_dir / f'{month:02}'
                if month_dir.exists():
                    directories.append(month_dir)
            year_now -= 1
        return directories

    def _update_all_capture_indexes(self):
        """Refresh the index of every month directory, recent captures first.

        Run that after the captures are in the proper directories.
        """
        def refresh_indexes_below(captures_root: Path) -> None:
            # Globbing for every `*/*/*/uuid` path would stat hundreds of
            # thousands of files while only the month directories
            # (ex: 2023/06) matter, hence the dedicated directory listing.
            for month_dir in self._make_dirs_list(captures_root):
                if self.shutdown_requested():
                    self.logger.warning('Shutdown requested, breaking.')
                    return
                self._update_index(month_dir)

        # Recent captures
        self.logger.info('Update recent indexes')
        refresh_indexes_below(get_captures_dir())
        self.logger.info('Recent indexes updated')

        # Archived captures
        self.logger.info('Update archives indexes')
        refresh_indexes_below(self.archived_captures_dir)
        self.logger.info('Archived indexes updated')

    def _archive(self):
        """Move the captures older than the archiving interval to the archive tree.

        The cut-off is the first day of the month that lies
        ``get_config('generic', 'archive')`` days in the past: whole months are
        archived, never parts of them. For every capture to archive, the Redis
        key named after its path is deleted, plain ``*.har`` files are gzipped,
        the pickled tree is removed, and the directory is moved to
        ``archived_captures/<year>/<month>``.

        At most 999 captures per month are moved in one call; the remaining ones
        are picked up by a later call. A requested shutdown stops the whole
        operation, the Redis deletions queued so far are still executed.
        """
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        # Align the cut-off on the first day of its month.
        cut_time = (datetime.now() - archive_interval).date().replace(day=1)

        # Format:
        # { 2020: { 12: [capture_directory, ...] } }
        to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
        # In order to avoid scanning the complete directory on each run, we check if year and month are
        # older than the cut time.
        for index in get_captures_dir().glob('*/*/index'):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break
            try:
                month_start = date(int(index.parent.parent.name), int(index.parent.name), 1)
            except ValueError:
                # Not a <year>/<month> directory: ignore it instead of aborting the run.
                self.logger.warning(f'{index.parent} is not a year/month directory, skipping.')
                continue
            if month_start >= cut_time:
                continue

            for capture_uuid in index.parent.glob('*/uuid'):
                capture_dir = capture_uuid.parent
                # The directory name is the capture timestamp, with or without microseconds.
                try:
                    timestamp = datetime.strptime(capture_dir.name, '%Y-%m-%dT%H:%M:%S.%f')
                except ValueError:
                    try:
                        timestamp = datetime.strptime(capture_dir.name, '%Y-%m-%dT%H:%M:%S')
                    except ValueError:
                        # One oddly named directory must not block the archiving of all the others.
                        self.logger.warning(f'Unable to get a timestamp out of {capture_dir}, skipping.')
                        continue
                if timestamp.date() >= cut_time:
                    continue
                to_archive[timestamp.year][timestamp.month].append(capture_dir)
                self.logger.info(f'Archiving {capture_dir}.')

        if not to_archive:
            self.logger.info('Nothing to archive.')
            return

        p = self.redis.pipeline()
        shutdown = False
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                dest_dir.mkdir(parents=True, exist_ok=True)
                capture_breakpoint = 1000
                for capture_path in captures:
                    capture_breakpoint -= 1
                    if capture_breakpoint <= 0:
                        # Break and restart later
                        self.logger.info(f'Archived many captures in {year}-{month:02}, will keep going later.')
                        break
                    # The counter goes 999, 998, ...: look for a shutdown request
                    # before the first capture, then once every 100 captures.
                    if capture_breakpoint % 100 == 99 and self.shutdown_requested():
                        self.logger.warning('Shutdown requested, breaking.')
                        shutdown = True
                        break
                    p.delete(str(capture_path))
                    # If the HAR isn't archived yet, archive it before copy
                    for har in capture_path.glob('*.har'):
                        with har.open('rb') as f_in:
                            with gzip.open(f'{har}.gz', 'wb') as f_out:
                                shutil.copyfileobj(f_in, f_out)
                        har.unlink()
                    (capture_path / 'tree.pickle').unlink(missing_ok=True)
                    (capture_path / 'tree.pickle.gz').unlink(missing_ok=True)
                    shutil.move(str(capture_path), str(dest_dir))
                if shutdown:
                    break
            if shutdown:
                # Leave the remaining months for a later run too.
                break
        # Executed in every case, so the keys of the captures moved so far are dropped.
        p.execute()

        self.logger.info('Archiving done.')

    def _compress_hars(self):
        """This method is very slow (it checks every single capture for non-compressed HARs)
        The new approach is to compress the har of every capture by default so this shouldn't be
        needed anymore. Keeping it here just for reference, or to process old archives that contain
        non-gziped HARs.
        """
        self.logger.info('Compressing archived captures')
        for index in self.archived_captures_dir.glob('*/*/index'):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break
            with index.open('r') as _f:
                for uuid, dirname in csv.reader(_f):
                    for har in (index.parent / dirname).glob('*.har'):
                        with har.open('rb') as f_in:
                            with gzip.open(f'{har}.gz', 'wb') as f_out:
                                shutil.copyfileobj(f_in, f_out)
                        har.unlink()
        self.logger.info('Archived captures compressed')

    def _load_indexes(self):
        """Load the on-disk indexes into the Redis lookup hashes.

        * Recent captures: every ``uuid -> directory`` entry whose directory
          exists is written to the ``lookup_dirs`` hash.
        * Archived captures: the UUIDs missing from ``lookup_dirs_archived`` are
          added to it if their directory exists, and all the UUIDs of the index
          are removed from ``lookup_dirs``.

        An index without any usable entry is deleted.
        """
        # Recent captures
        for index in get_captures_dir().glob('*/*/index'):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break

            self.logger.info(f'Loading {index}')
            with index.open('r') as _f:
                recent_uuids: Mapping = {uuid: str(index.parent / dirname)
                                         for uuid, dirname in csv.reader(_f)
                                         if (index.parent / dirname).exists()}
            if recent_uuids:
                self.logger.info(f'{len(recent_uuids)} captures in directory.')
                self.redis.hset('lookup_dirs', mapping=recent_uuids)
            else:
                index.unlink()
        self.logger.info('Recent indexes loaded')

        # The keys come back as bytes from this Redis connection.
        already_archived_uuids = {k.decode() for k in self.redis.hkeys('lookup_dirs_archived')}
        self.logger.info(f'Already have {len(already_archived_uuids)} UUIDs archived')
        # Archived captures, most recent months first
        for index in sorted(self.archived_captures_dir.glob('*/*/index'), reverse=True):
            if self.shutdown_requested():
                self.logger.warning('Shutdown requested, breaking.')
                break
            self.logger.debug(f'Loading {index}')
            with index.open('r') as _f:
                archived_uuids: Mapping = {uuid: index.parent / dirname
                                           for uuid, dirname in csv.reader(_f)}
            if not archived_uuids:
                index.unlink()
                continue

            self.logger.debug(f'{len(archived_uuids)} captures in directory.')
            new_uuids = set(archived_uuids.keys()) - already_archived_uuids
            if not new_uuids:
                self.logger.debug('No new archived UUID to check.')
                continue

            self.logger.info(f'Loading {index}, {len(archived_uuids)} captures in directory, {len(new_uuids)} archived UUID to check.')
            # NOTE: Only check if the directory exists if the UUID isn't in the cache.
            to_cache = {uuid: str(archived_uuids[uuid])
                        for uuid in new_uuids
                        if archived_uuids[uuid].exists()}
            if to_cache:
                # HSET needs at least one field: none is left when the index only
                # references directories that are gone, skip the call in that case.
                self.redis.hset('lookup_dirs_archived', mapping=to_cache)
            self.redis.hdel('lookup_dirs', *archived_uuids.keys())
        self.logger.info('Archived indexes loaded')


def main():
    """Create the archiver and start its run loop with a 3600 seconds sleep setting."""
    Archiver().run(sleep_in_sec=3600)


# Only start the archiver when this file is executed as a script.
if __name__ == '__main__':
    main()
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`#!/usr/bin/env python3`

			`import csv`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`import gzip`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`import logging`
new: Logging config in file 2022-11-23 15:54:22 +01:00			`import logging.config`
chg: Reduce disk usage 2023-08-06 21:34:20 +02:00			`import os`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`import shutil`

chg: reorder imports 2021-09-07 12:59:31 +02:00			`from collections import defaultdict`
chg: sunday cleanup 2022-05-23 00:15:52 +02:00			`from collections.abc import Mapping`
chg: Avoid directory listing as much as possible in archiver, allow shutdown 2023-08-04 14:02:45 +02:00			`from datetime import datetime, timedelta, date`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`from pathlib import Path`
chg: Normalize logging on the config file settings 2023-04-05 16:23:46 +02:00			`from typing import Dict, List, Optional`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00			`from redis import Redis`

chg: use template 2021-10-18 13:06:43 +02:00			`from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path`
			`from lookyloo.helpers import get_captures_dir`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
new: Logging config in file 2022-11-23 15:54:22 +01:00			`logging.config.dictConfig(get_config('logging'))`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00

			`class Archiver(AbstractManager):`

chg: Normalize logging on the config file settings 2023-04-05 16:23:46 +02:00			`def __init__(self, loglevel: Optional[int]=None):`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`super().__init__(loglevel)`
			`self.script_name = 'archiver'`
fix: no decoding in archiver, catch exception when requesting hashes on broken capture 2023-03-16 14:47:24 +01:00			`self.redis = Redis(unix_socket_path=get_socket_path('cache'))`
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00
			`# make sure archived captures dir exists`
chg: cleanup in the mail lookyloo class 2021-08-24 18:32:54 +02:00			`self.archived_captures_dir = get_homedir() / 'archived_captures'`
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00			`self.archived_captures_dir.mkdir(parents=True, exist_ok=True)`

chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`self._load_indexes()`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
			`def _to_run_forever(self):`
			`self._archive()`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`self._update_all_capture_indexes()`
			`self._load_indexes()`
chg: Compress HARs by default, update codebase accordingly 2023-08-11 13:16:59 +02:00			`# The HARs are supposedly all compressed so this call shouldn't be required`
			`# unless you're processing old captures for the first time.`
			`# self._compress_hars()`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00
			`def _update_index(self, root_dir: Path) -> None:`
chg: better handling of broken indexes in archiver 2021-11-26 18:36:35 +01:00			`current_index: Dict[str, str] = {}`
chg: Reduce disk usage 2023-08-06 21:34:20 +02:00			`if not os.listdir(root_dir):`
			`# the directory is empty, we can safely remove it`
			`root_dir.rmdir()`
			`return`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00			`self.logger.debug(f'Updating index for {root_dir}')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`index_file = root_dir / 'index'`
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00			`existing_captures_names = {existing_capture.name for existing_capture in index_file.parent.iterdir()`
			`if existing_capture.name != 'index'}`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`if index_file.exists():`
			`# Skip index if the directory has been archived.`
chg: better handling of broken indexes in archiver 2021-11-26 18:36:35 +01:00			`try:`
			`with index_file.open('r') as _f:`
chg: Speedup indexes update 2023-08-05 20:47:08 +02:00			`current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)`
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00			`if uuid`
			`and dirname`
			`and dirname in existing_captures_names}`
chg: Improve logging for archiver. 2022-09-23 14:32:42 +02:00			`except Exception as e:`
chg: better handling of broken indexes in archiver 2021-11-26 18:36:35 +01:00			`# the index file is broken, it will be recreated.`
chg: Improve logging for archiver. 2022-09-23 14:32:42 +02:00			`self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}')`
chg: better handling of broken indexes in archiver 2021-11-26 18:36:35 +01:00			`pass`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`if not current_index:`
			`index_file.unlink()`
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00
			`if set(current_index.values()) == existing_captures_names:`
			`# No new captures, quitting`
			`self.logger.debug(f'No new captures in {root_dir}.')`
			`return`

			`new_captures = sorted(existing_captures_names - set(current_index.values()), reverse=True)`
			`self.logger.info(f'{len(new_captures)} new captures in {root_dir}.')`

			`for capture_dir_name in new_captures:`
			`capture_dir = root_dir / capture_dir_name`
chg: Reduce disk usage 2023-08-06 21:34:20 +02:00			`if not capture_dir.is_dir():`
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00			`self.logger.warning(f'{capture_dir} is not a directory')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`continue`
fix: Avoid exception on empty dir in archiver. 2023-08-16 11:15:00 +02:00			`if not next(capture_dir.iterdir(), None):`
			`self.logger.warning(f'{capture_dir} is empty, removing.')`
			`capture_dir.rmdir()`
			`continue`
chg: Reduce disk usage 2023-08-06 21:34:20 +02:00			`uuid_file = capture_dir / 'uuid'`
			`if not uuid_file.exists():`
			`self.logger.warning(f'No UUID file in {capture_dir}.')`
chg: Proper use of shutil.move, speedup initialization of CaptureCache 2023-08-08 12:41:21 +02:00			`shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))`
chg: Reduce disk usage 2023-08-06 21:34:20 +02:00			`continue`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`with uuid_file.open() as _f:`
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00			`uuid = _f.read().strip()`
			`if not uuid:`
			`self.logger.warning(f'{uuid_file} is empty')`
chg: Proper use of shutil.move, speedup initialization of CaptureCache 2023-08-08 12:41:21 +02:00			`shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))`
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00			`continue`
			`if uuid in current_index:`
			`self.logger.warning(f'Duplicate UUID ({uuid}) in {current_index[uuid]} and {uuid_file.parent.name}')`
chg: Proper use of shutil.move, speedup initialization of CaptureCache 2023-08-08 12:41:21 +02:00			`shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))`
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00			`continue`
			`current_index[uuid] = uuid_file.parent.name`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00
			`if not current_index:`
chg: out of safety, do not remove a capture dir. 2021-08-30 12:54:17 +02:00			`# The directory has been archived. It is probably safe to unlink, but`
			`# if it's not, we will lose a whole buch of captures. Moving instead for safety.`
chg: Reduce disk usage 2023-08-06 21:34:20 +02:00			`shutil.move(str(root_dir), str(get_homedir() / 'discarded_captures' / root_dir.parent / root_dir.name))`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`return`

			`with index_file.open('w') as _f:`
			`index_writer = csv.writer(_f)`
			`for uuid, dirname in current_index.items():`
			`index_writer.writerow([uuid, dirname])`

chg: Speedup indexes update 2023-08-05 20:47:08 +02:00			`def _make_dirs_list(self, root_dir: Path) -> List[Path]:`
			`directories = []`
			`year_now = date.today().year`
			`while True:`
			`year_dir = root_dir / str(year_now)`
			`if not year_dir.exists():`
			`# if we do not have a directory with this year, quit the loop`
			`break`
			`for month in range(12, 0, -1):`
			`month_dir = year_dir / f'{month:02}'`
			`if month_dir.exists():`
			`directories.append(month_dir)`
			`year_now -= 1`
			`return directories`

chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`def _update_all_capture_indexes(self):`
			`'''Run that after the captures are in the proper directories'''`
			`# Recent captures`
chg: Avoid captures without url(s) or document 2022-09-27 11:33:36 +02:00			`self.logger.info('Update recent indexes')`
chg: Speedup indexes update 2023-08-05 20:47:08 +02:00			# NOTE: the call below will check the existence of every path ending with `uuid`,
			`# it is extremely inneficient as we have many hundred of thusands of them`
			`# and we only care about the rood directory (ex: 2023/06)`
			`# directories_to_index = {capture_dir.parent.parent`
			`# for capture_dir in get_captures_dir().glob('//*/uuid')}`
			`for directory_to_index in self._make_dirs_list(get_captures_dir()):`
chg: Avoid a few more disk access whenever possible. 2023-08-07 13:13:57 +02:00			`if self.shutdown_requested():`
			`self.logger.warning('Shutdown requested, breaking.')`
			`break`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`self._update_index(directory_to_index)`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Recent indexes updated')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`# Archived captures`
chg: Avoid captures without url(s) or document 2022-09-27 11:33:36 +02:00			`self.logger.info('Update archives indexes')`
chg: Speedup indexes update 2023-08-05 20:47:08 +02:00			`for directory_to_index in self._make_dirs_list(self.archived_captures_dir):`
chg: Avoid directory listing as much as possible in archiver, allow shutdown 2023-08-04 14:02:45 +02:00			`if self.shutdown_requested():`
			`self.logger.warning('Shutdown requested, breaking.')`
			`break`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`self._update_index(directory_to_index)`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Archived indexes updated')`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
			`def _archive(self):`
			`archive_interval = timedelta(days=get_config('generic', 'archive'))`
chg: Make the cut-off date for archiving the 1st of the month 2021-08-23 15:36:59 +02:00			`cut_time = (datetime.now() - archive_interval).date()`
fix: properly match cut time 2021-08-23 15:51:06 +02:00			`cut_time = cut_time.replace(day=1)`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
			`# Format:`
			`# { 2020: { 12: [(directory, uuid)] } }`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))`
chg: Avoid directory listing as much as possible in archiver, allow shutdown 2023-08-04 14:02:45 +02:00			`# In order to avoid scanning the complete directory on each run, we check if year and month are`
			`# older than the cut time.`
			`for index in get_captures_dir().glob('//index'):`
			`if self.shutdown_requested():`
			`self.logger.warning('Shutdown requested, breaking.')`
			`break`
			`month = int(index.parent.name)`
			`year = int(index.parent.parent.name)`
			`if date(year, month, 1) >= cut_time:`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`continue`
chg: Avoid directory listing as much as possible in archiver, allow shutdown 2023-08-04 14:02:45 +02:00
			`for capture_uuid in index.parent.glob('*/uuid'):`
			`try:`
			`timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')`
			`except ValueError:`
			`timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S')`
			`if timestamp.date() >= cut_time:`
			`continue`
			`to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent)`
			`self.logger.info(f'Archiving {capture_uuid.parent}.')`
chg: Better use of cache, sanity checks 2021-08-23 12:17:44 +02:00
			`if not to_archive:`
			`self.logger.info('Nothing to archive.')`
			`return`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`p = self.redis.pipeline()`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`for year, month_captures in to_archive.items():`
			`for month, captures in month_captures.items():`
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00			`dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00			`dest_dir.mkdir(parents=True, exist_ok=True)`
chg: Improve archiver 2023-08-20 16:21:33 +02:00			`capture_breakpoint = 1000`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`for capture_path in captures:`
chg: Improve archiver 2023-08-20 16:21:33 +02:00			`capture_breakpoint -= 1`
			`if capture_breakpoint <= 0:`
			`# Break and restart later`
			`self.logger.info('Archived many captures in {year}-{month}, will keep going later.')`
			`break`
			`elif capture_breakpoint % 100:`
			`# Just check if we requested a shutdown.`
			`if self.shutdown_requested():`
			`self.logger.warning('Shutdown requested, breaking.')`
			`break`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`p.delete(str(capture_path))`
chg: Compress HARs by default, update codebase accordingly 2023-08-11 13:16:59 +02:00			`# If the HAR isn't archived yet, archive it before copy`
			`for har in capture_path.glob('*.har'):`
			`with har.open('rb') as f_in:`
			`with gzip.open(f'{har}.gz', 'wb') as f_out:`
			`shutil.copyfileobj(f_in, f_out)`
			`har.unlink()`
chg: Improve archiver 2023-08-20 16:21:33 +02:00			`(capture_path / 'tree.pickle').unlink(missing_ok=True)`
			`(capture_path / 'tree.pickle.gz').unlink(missing_ok=True)`
chg: Proper use of shutil.move, speedup initialization of CaptureCache 2023-08-08 12:41:21 +02:00			`shutil.move(str(capture_path), str(dest_dir))`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`p.execute()`

chg: Cleanup when dir is moved, digit months on 2 values 2021-08-23 14:53:19 +02:00			`self.logger.info('Archiving done.')`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`def _compress_hars(self):`
chg: Compress HARs by default, update codebase accordingly 2023-08-11 13:16:59 +02:00			`"""This method is very slow (it checks every single capture for non-compressed HARs)`
			`The new approach is to compress the har of every capture by default so this shouldn't be`
			`needed anymore. Keeping it here just for reference, or to process old archives that contain`
			`non-gziped HARs.`
			`"""`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Compressing archived captures')`
fix: use glob with path instead of rglob (faster)) 2023-08-04 13:15:03 +02:00			`for index in self.archived_captures_dir.glob('//index'):`
chg: Avoid directory listing as much as possible in archiver, allow shutdown 2023-08-04 14:02:45 +02:00			`if self.shutdown_requested():`
			`self.logger.warning('Shutdown requested, breaking.')`
			`break`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`with index.open('r') as _f:`
			`for uuid, dirname in csv.reader(_f):`
chg: Compress HARs by default, update codebase accordingly 2023-08-11 13:16:59 +02:00			`for har in (index.parent / dirname).glob('*.har'):`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00			`with har.open('rb') as f_in:`
			`with gzip.open(f'{har}.gz', 'wb') as f_out:`
			`shutil.copyfileobj(f_in, f_out)`
			`har.unlink()`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Archived captures compressed')`
new: compress HAR files in archived captures. 2022-07-12 18:44:33 +02:00
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`def _load_indexes(self):`
			`# Initialize archives`
fix: use glob with path instead of rglob (faster)) 2023-08-04 13:15:03 +02:00			`for index in get_captures_dir().glob('//index'):`
chg: Avoid directory listing as much as possible in archiver, allow shutdown 2023-08-04 14:02:45 +02:00			`if self.shutdown_requested():`
			`self.logger.warning('Shutdown requested, breaking.')`
			`break`

chg: Many improvments in archiver 2023-08-05 13:36:56 +02:00			`self.logger.info(f'Loading {index}')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`with index.open('r') as _f:`
chg: Speedup indexes update 2023-08-05 20:47:08 +02:00			`recent_uuids: Mapping = {uuid: str(index.parent / dirname)`
			`for uuid, dirname in csv.reader(_f)`
			`if (index.parent / dirname).exists()}`
fix: unlink indexes pointing to unknown directories 2021-08-30 14:45:44 +02:00			`if recent_uuids:`
chg: Many improvments in archiver 2023-08-05 13:36:56 +02:00			`self.logger.info(f'{len(recent_uuids)} captures in directory.')`
chg: sunday cleanup 2022-05-23 00:15:52 +02:00			`self.redis.hset('lookup_dirs', mapping=recent_uuids)`
fix: unlink indexes pointing to unknown directories 2021-08-30 14:45:44 +02:00			`else:`
			`index.unlink()`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Recent indexes loaded')`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00
chg: Many improvments in archiver 2023-08-05 13:36:56 +02:00			`already_archived_uuids = {k.decode() for k in self.redis.hkeys('lookup_dirs_archived')}`
			`self.logger.info(f'Already have {len(already_archived_uuids)} UUIDs archived')`
chg: Cleanup archiver, initialize index captures in start 2021-08-24 17:10:14 +02:00			`# Initialize archives`
chg: Many improvments in archiver 2023-08-05 13:36:56 +02:00			`for index in sorted(self.archived_captures_dir.glob('//index'), reverse=True):`
chg: Avoid directory listing as much as possible in archiver, allow shutdown 2023-08-04 14:02:45 +02:00			`if self.shutdown_requested():`
			`self.logger.warning('Shutdown requested, breaking.')`
			`break`
chg: Many improvments in archiver 2023-08-05 13:36:56 +02:00			`self.logger.debug(f'Loading {index}')`
chg: Improve storage, support both modes. 2021-08-26 15:49:19 +02:00			`with index.open('r') as _f:`
chg: Speedup indexes update 2023-08-05 20:47:08 +02:00			`archived_uuids: Mapping = {uuid: index.parent / dirname`
			`for uuid, dirname in csv.reader(_f)}`
fix: unlink indexes pointing to unknown directories 2021-08-30 14:45:44 +02:00			`if archived_uuids:`
chg: Many improvments in archiver 2023-08-05 13:36:56 +02:00			`self.logger.debug(f'{len(archived_uuids)} captures in directory.')`
			`new_uuids = set(archived_uuids.keys()) - already_archived_uuids`
			`if not new_uuids:`
			`self.logger.debug('No new archived UUID to check.')`
			`continue`

			`self.logger.info(f'Loading {index}, {len(archived_uuids)} captures in directory, {len(new_uuids)} archived UUID to check.')`
chg: Attempt to reduce disk use 2023-08-04 15:03:58 +02:00			`# NOTE: Only check if the directory exists if the UUID isn't in the cache.`
chg: Speedup indexes update 2023-08-05 20:47:08 +02:00			`self.redis.hset('lookup_dirs_archived',`
			`mapping={uuid: str(dirname)`
			`for uuid, dirname in archived_uuids.items()`
			`if uuid in new_uuids and dirname.exists()})`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.redis.hdel('lookup_dirs', *archived_uuids.keys())`
fix: unlink indexes pointing to unknown directories 2021-08-30 14:45:44 +02:00			`else:`
			`index.unlink()`
chg: Improve logging in archiver 2022-07-27 14:33:28 +02:00			`self.logger.info('Archived indexes loaded')`
chg: Force init the archived indexes 2021-08-23 15:14:08 +02:00
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00
			`def main():`
			`a = Archiver()`
chg: Make archiver an index generator 2021-08-30 12:48:13 +02:00			`a.run(sleep_in_sec=3600)`
new: Archiver, refactoring. 2021-08-20 17:46:22 +02:00

			`if __name__ == '__main__':`
			`main()`