lookyloo/bin/background_indexer.py

#!/usr/bin/env python3

from __future__ import annotations

import logging
import logging.config

from redis import Redis
from typing import Generator

from lookyloo import Lookyloo, Indexing
from lookyloo.capturecache import get_pickle_path
from lookyloo.default import AbstractManager, get_config, get_socket_path
from lookyloo.exceptions import NoValidHarFile


# Apply the 'logging' configuration entry at import time, so it is in place
# before any BackgroundIndexer is instantiated.
logging.config.dictConfig(get_config('logging'))


class BackgroundIndexer(AbstractManager):
    """Manager that indexes the captures which are not fully indexed yet.

    On every run it walks the ``lookup_dirs`` hash of the cache Redis, skips the
    captures that cannot or should not be indexed, and passes the remaining ones
    to :class:`Indexing`, one index type at a time.
    """

    def __init__(self, full: bool=False, loglevel: int | None=None):
        """Prepare the Lookyloo, Indexing and Redis handles used by the indexer.

        :param full: Run as the full indexer: forwarded to ``Indexing(full_index=...)``,
                     selects the ``background_full_indexer`` script name and disables
                     the ``no_index`` check done for public instances.
        :param loglevel: Forwarded as-is to ``AbstractManager.__init__``.
        """
        super().__init__(loglevel)
        self.lookyloo = Lookyloo(cache_max_size=1)
        self.is_public_instance = get_config('generic', 'public_instance')
        self.full_indexer = full
        self.indexing = Indexing(full_index=self.full_indexer)
        if self.full_indexer:
            self.script_name = 'background_full_indexer'
        else:
            self.script_name = 'background_indexer'

        # Redis connector so we don't use the one from Lookyloo
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    def _to_run_forever(self) -> None:
        """Run one indexing pass, then drop the tree cache held by Lookyloo."""
        self._check_indexes()
        # Don't need the cache in this class.
        self.lookyloo.clear_tree_cache()

    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]:
        """Yield ``(indexed, uuid)`` for every capture that still needs indexing.

        ``indexed`` is the value returned by ``Indexing.capture_indexed``; as consumed
        by ``_check_indexes`` its six flags are, in order: urls, resources (body
        hashes), cookies, HTTP headers hashes, favicons, identifiers.
        Captures are skipped when they are flagged ``no_index`` (public instance and
        regular indexer only), when no pickle is available, or when every flag is set.
        """
        # NOTE: only get the non-archived captures for now.
        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
            if not self.full_indexer:
                # If we're not running the full indexer, check if the capture should be indexed.
                if self.is_public_instance and self.redis.hexists(directory, 'no_index'):
                    # Capture unindexed
                    continue

            if get_pickle_path(directory) is None:
                # pickle isn't ready, we can't index.
                continue
            indexed = self.indexing.capture_indexed(uuid)
            if all(indexed):
                continue
            yield indexed, uuid

    def _check_indexes(self) -> None:
        """Index every missing entry for the captures returned by ``_to_index_no_cache``.

        Does nothing when ``Indexing.can_index`` is falsy. Otherwise
        ``Indexing.indexing_done()`` is always called at the end of the pass, even if
        indexing a capture raises, so a failed pass does not leave the indexing marked
        as ongoing (presumably what ``can_index`` checks -- confirm in ``Indexing``).
        Any exception other than ``NoValidHarFile`` still propagates to the caller.
        """
        if not self.indexing.can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Indexing already ongoing in another process.')
            return None
        try:
            self.logger.info(f'Check {self.script_name}...')
            for indexed, uuid_to_index in self._to_index_no_cache():
                try:
                    ct = self.lookyloo.get_crawled_tree(uuid_to_index)
                except NoValidHarFile:
                    # The tree cannot be loaded: drop the pickle and move on to the next capture.
                    self.logger.warning(f'Broken pickle for {uuid_to_index}')
                    self.lookyloo.remove_pickle(uuid_to_index)
                    continue

                if not indexed[0]:
                    self.logger.info(f'Indexing urls for {uuid_to_index}')
                    self.indexing.index_url_capture(ct)
                if not indexed[1]:
                    self.logger.info(f'Indexing resources for {uuid_to_index}')
                    self.indexing.index_body_hashes_capture(ct)
                if not indexed[2]:
                    self.logger.info(f'Indexing cookies for {uuid_to_index}')
                    self.indexing.index_cookies_capture(ct)
                if not indexed[3]:
                    self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
                    self.indexing.index_http_headers_hashes_capture(ct)
                if not indexed[4]:
                    self.logger.info(f'Indexing favicons for {uuid_to_index}')
                    favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
                    self.indexing.index_favicons_capture(uuid_to_index, favicons)
                if not indexed[5]:
                    self.logger.info(f'Indexing identifiers for {uuid_to_index}')
                    self.indexing.index_identifiers_capture(ct)
                # NOTE: categories aren't taken in account here, should be fixed(?)
                # see indexing.index_categories_capture(capture_uuid, categories)
        finally:
            # Always signal the end of the pass, including when a capture failed to index.
            self.indexing.indexing_done()
        self.logger.info('... done.')


def main() -> None:
    """Start the regular (non-full) indexer and call its ``run`` with ``sleep_in_sec=60``."""
    BackgroundIndexer().run(sleep_in_sec=60)


def main_full_indexer() -> None:
    """Start the full indexer and call its ``run`` with ``sleep_in_sec=60``.

    :raises Exception: if ``index_everything`` is falsy in the ``generic`` config.
    """
    full_indexing_enabled = get_config('generic', 'index_everything')
    if not full_indexing_enabled:
        raise Exception('Full indexer is disabled.')
    # NOTE: for now, it only indexes the captures that aren't archived.
    #       we will change that later, but for now, it's a good start.
    BackgroundIndexer(full=True).run(sleep_in_sec=60)


# Running this file as a script starts the regular indexer via main(),
# not main_full_indexer().
if __name__ == '__main__':
    main()
new: background indexer 2021-03-12 16:53:00 +01:00			`#!/usr/bin/env python3`

chg: Use new annotations 2024-01-12 17:15:41 +01:00			`from __future__ import annotations`

new: background indexer 2021-03-12 16:53:00 +01:00			`import logging`
new: Logging config in file 2022-11-23 15:54:22 +01:00			`import logging.config`
new: background indexer 2021-03-12 16:53:00 +01:00
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`from redis import Redis`
			`from typing import Generator`

new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`from lookyloo import Lookyloo, Indexing`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`from lookyloo.capturecache import get_pickle_path`
			`from lookyloo.default import AbstractManager, get_config, get_socket_path`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`from lookyloo.exceptions import NoValidHarFile`
new: background indexer 2021-03-12 16:53:00 +01:00
new: Logging config in file 2022-11-23 15:54:22 +01:00
			`logging.config.dictConfig(get_config('logging'))`
new: background indexer 2021-03-12 16:53:00 +01:00

			`class BackgroundIndexer(AbstractManager):`

new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`def __init__(self, full: bool=False, loglevel: int \| None=None):`
new: background indexer 2021-03-12 16:53:00 +01:00			`super().__init__(loglevel)`
chg: Disable index cache for backgroupd processes 2024-03-12 12:02:10 +01:00			`self.lookyloo = Lookyloo(cache_max_size=1)`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`self.is_public_instance = get_config('generic', 'public_instance')`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`self.full_indexer = full`
			`self.indexing = Indexing(full_index=self.full_indexer)`
			`if self.full_indexer:`
			`self.script_name = 'background_full_indexer'`
			`else:`
			`self.script_name = 'background_indexer'`
new: background indexer 2021-03-12 16:53:00 +01:00
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`# Redis connector so we don't use the one from Lookyloo`
			`self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)`

chg: Use new annotations 2024-01-12 17:15:41 +01:00			`def _to_run_forever(self) -> None:`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`self._check_indexes()`
chg: clear cache when it is not needed 2024-03-08 15:50:47 +01:00			`# Don't need the cache in this class.`
			`self.lookyloo.clear_tree_cache()`
new: background indexer 2021-03-12 16:53:00 +01:00
new: Index and views for identifiers 2024-03-14 00:56:28 +01:00			`def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]:`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`# NOTE: only get the non-archived captures for now.`
			`for uuid, directory in self.redis.hscan_iter('lookup_dirs'):`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`if not self.full_indexer:`
			`# If we're not running the full indexer, check if the capture should be indexed.`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`if self.is_public_instance and self.redis.hexists(directory, 'no_index'):`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`# Capture unindexed`
			`continue`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00
			`if get_pickle_path(directory) is None:`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`# pickle isn't ready, we can't index.`
new: background indexer 2021-03-12 16:53:00 +01:00			`continue`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`indexed = self.indexing.capture_indexed(uuid)`
new: background indexer 2021-03-12 16:53:00 +01:00			`if all(indexed):`
			`continue`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`yield indexed, uuid`

			`def _check_indexes(self) -> None:`
			`if not self.indexing.can_index:`
			`# There is no reason to run this method in multiple scripts.`
			`self.logger.info('Indexing already ongoing in another process.')`
			`return None`
			`self.logger.info(f'Check {self.script_name}...')`
			`for indexed, uuid_to_index in self._to_index_no_cache():`
new: background indexer 2021-03-12 16:53:00 +01:00			`try:`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`ct = self.lookyloo.get_crawled_tree(uuid_to_index)`
new: background indexer 2021-03-12 16:53:00 +01:00			`except NoValidHarFile:`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`self.logger.warning(f'Broken pickle for {uuid_to_index}')`
			`self.lookyloo.remove_pickle(uuid_to_index)`
new: background indexer 2021-03-12 16:53:00 +01:00			`continue`

			`if not indexed[0]:`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`self.logger.info(f'Indexing urls for {uuid_to_index}')`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`self.indexing.index_url_capture(ct)`
new: background indexer 2021-03-12 16:53:00 +01:00			`if not indexed[1]:`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`self.logger.info(f'Indexing resources for {uuid_to_index}')`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`self.indexing.index_body_hashes_capture(ct)`
new: background indexer 2021-03-12 16:53:00 +01:00			`if not indexed[2]:`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`self.logger.info(f'Indexing cookies for {uuid_to_index}')`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`self.indexing.index_cookies_capture(ct)`
new: Basic support for HHHash 2023-07-21 15:48:20 +02:00			`if not indexed[3]:`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`self.indexing.index_http_headers_hashes_capture(ct)`
new: Add favicons in indexer 2024-02-19 16:15:52 +01:00			`if not indexed[4]:`
fix: use a more direct way to index 2024-03-09 15:33:10 +01:00			`self.logger.info(f'Indexing favicons for {uuid_to_index}')`
			`favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)`
			`self.indexing.index_favicons_capture(uuid_to_index, favicons)`
new: Index and views for identifiers 2024-03-14 00:56:28 +01:00			`if not indexed[5]:`
			`self.logger.info(f'Indexing identifiers for {uuid_to_index}')`
			`self.indexing.index_identifiers_capture(ct)`
chg: Improve tree creation and cache 2021-09-22 17:09:04 +02:00			`# NOTE: categories aren't taken in account here, should be fixed(?)`
			`# see indexing.index_categories_capture(capture_uuid, categories)`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`self.indexing.indexing_done()`
new: Shodan MM3H indexing 2024-02-26 17:07:23 +01:00			`self.logger.info('... done.')`

new: background indexer 2021-03-12 16:53:00 +01:00
chg: Use new annotations 2024-01-12 17:15:41 +01:00			`def main() -> None:`
new: background indexer 2021-03-12 16:53:00 +01:00			`i = BackgroundIndexer()`
			`i.run(sleep_in_sec=60)`


new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`def main_full_indexer() -> None:`
			`if not get_config('generic', 'index_everything'):`
			`raise Exception('Full indexer is disabled.')`
chg: Bump deps 2024-03-08 10:36:04 +01:00			`# NOTE: for now, it only indexes the captures that aren't archived.`
			`# we will change that later, but for now, it's a good start.`
new: Indexer for all the captures 2024-03-05 20:51:21 +01:00			`i = BackgroundIndexer(full=True)`
			`i.run(sleep_in_sec=60)`


new: background indexer 2021-03-12 16:53:00 +01:00			`if __name__ == '__main__':`
			`main()`