#!/usr/bin/env python3

import logging
import logging.config
import os
import shutil

from datetime import date, datetime, timedelta
from pathlib import Path
from typing import Optional, List

from lookyloo.default import AbstractManager, get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.lookyloo import Lookyloo
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk

logging.config.dictConfig(get_config('logging'))


class BackgroundIndexer(AbstractManager):
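    """Daemon that watches the capture directories on disk, builds the pickled
    capture trees that are missing, and keeps the Redis indexes up to date."""
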
    def __init__(self, loglevel: Optional[int]=None):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self):
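        """One iteration of the daemon loop: build the missing pickles first,
        and only run the (slower) index check once the backlog is done."""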
        all_done = self._build_missing_pickles()
        if all_done:
            self._check_indexes()
        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)

    def _make_dirs_list(self, root_dir: Path) -> List[Path]:
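        """Return every existing <root_dir>/<year>/<month> capture directory,
        most recent first, stopping at the first year without a directory."""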
        directories = []
        year_now = date.today().year
        while True:
            year_dir = root_dir / str(year_now)
            if not year_dir.exists():
                # if we do not have a directory with this year, quit the loop
                break
            for month in range(12, 0, -1):
                month_dir = year_dir / f'{month:02}'
                if month_dir.exists():
                    directories.append(month_dir)
            year_now -= 1
        return directories

    def _build_missing_pickles(self) -> bool:
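        """Build the pickled trees for the captures that do not have one yet,
        most recent captures first.

        Return True only when new pickles were built and the backlog is fully
        processed, so the caller knows it is safe to run the index check."""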
        self.logger.debug('Build missing pickles...')
        # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time.
        # This value makes sure we break out of the loop and build pickles of the most recent captures.
        max_captures = 50
        got_new_captures = False
        # Compute the cutoff date: captures older than this are about to be archived,
        # so we do not build their pickles anymore.
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = (datetime.now() - archive_interval).date()
        for month_dir in self._make_dirs_list(self.lookyloo.capture_dir):
            for capture_time, path in get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True):
                if ((path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists()):
                    # We already have a pickle file
                    self.logger.debug(f'{path} has a pickle.')
                    continue
                if not list(path.rglob('*.har.gz')) and not list(path.rglob('*.har')):
                    # No HAR file
                    self.logger.debug(f'{path} has no HAR file.')
                    continue
                if is_locked(path):
                    # The capture is locked: another process is already generating the pickle.
                    self.logger.debug(f'{path} is locked, pickle generated by another process.')
                    continue
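
                # Each capture directory contains a 'uuid' file holding the
                # unique identifier of the capture.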
                with (path / 'uuid').open() as f:
                    uuid = f.read()

                if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                    # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                    self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
                else:
                    cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
                    if cached_path != path:
                        # We have a duplicate UUID, it is probably related to some bad copy/paste
                        if cached_path.exists():
                            # Both paths exist, move the one that isn't in lookup_dirs
                            self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {path}, discarding the latter')
                            try:
                                shutil.move(str(path), str(self.discarded_captures_dir / path.name))
                            except FileNotFoundError as e:
                                self.logger.warning(f'Unable to move capture: {e}')
                            continue
                        else:
                            # The path in lookup_dirs for that UUID doesn't exist, just update it.
                            self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))

                try:
                    self.logger.info(f'Build pickle for {uuid}: {path.name}')
                    self.lookyloo.get_crawled_tree(uuid)
                    self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                    self.logger.info(f'Pickle for {uuid} built.')
                    got_new_captures = True
                    max_captures -= 1
                except MissingUUID:
                    self.logger.warning(f'Unable to find {uuid}. That should not happen.')
                except NoValidHarFile as e:
                    self.logger.critical(f'There are no HAR files in the capture {uuid}: {path.name} - {e}')
                except FileNotFoundError:
                    self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
                except Exception:
                    self.logger.exception(f'Unable to build pickle for {uuid}: {path.name}')
                    # The capture is not working, move it away.
                    try:
                        shutil.move(str(path), str(self.discarded_captures_dir / path.name))
                        self.lookyloo.redis.hdel('lookup_dirs', uuid)
                    except FileNotFoundError as e:
                        self.logger.warning(f'Unable to move capture: {e}')
                        continue
                if max_captures <= 0:
                    self.logger.info('Too many captures in the backlog, start from the beginning.')
                    return False
        if got_new_captures:
            self.logger.info('Finished building all missing pickles.')
            # Only return True if we built new pickles.
            return True
        return False

    def _check_indexes(self):
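        """Go through all the cached captures and rebuild the missing Redis
        indexes (URLs, body hashes, cookies, HTTP header hashes)."""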
        index_redis = self.lookyloo.indexing.redis
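        # Take a short-lived lock in Redis (SET NX, expiring after 5 minutes)
        # so only one process runs the indexing at any given time.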
        can_index = index_redis.set('ongoing_indexing', 1, ex=300, nx=True)
        if not can_index:
            # There is no reason to run this method in multiple processes at the same time.
            self.logger.info('Indexing already ongoing in another process.')
            return
        self.logger.info('Check indexes...')
        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
            if self.lookyloo.is_public_instance and cache.no_index:
                # The capture is flagged as not to be indexed on a public instance.
                continue
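            # Check all the indexes for this capture in a single pipelined call.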
            p = index_redis.pipeline()
            p.sismember('indexed_urls', cache.uuid)
            p.sismember('indexed_body_hashes', cache.uuid)
            p.sismember('indexed_cookies', cache.uuid)
            p.sismember('indexed_hhhashes', cache.uuid)
            indexed = p.execute()
            if all(indexed):
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(cache.uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {cache.uuid}')
                self.lookyloo.remove_pickle(cache.uuid)
                continue
            if not indexed[0]:
                self.logger.info(f'Indexing urls for {cache.uuid}')
                self.lookyloo.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {cache.uuid}')
                self.lookyloo.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {cache.uuid}')
                self.lookyloo.indexing.index_cookies_capture(ct)
            if not indexed[3]:
                self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
                self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
            # NOTE: categories aren't taken into account here, should be fixed(?)
            #       see indexing.index_categories_capture(capture_uuid, categories)
        index_redis.delete('ongoing_indexing')
        self.logger.info('... done.')


def main():
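    """Entry point: run the background indexer, waking up every 60 seconds."""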
    i = BackgroundIndexer()
    i.run(sleep_in_sec=60)


if __name__ == '__main__':
    main()