lookyloo/bin/background_indexer.py

113 lines
4.4 KiB
Python
Raw Normal View History

2021-03-12 16:53:00 +01:00
#!/usr/bin/env python3
2024-01-12 17:15:41 +01:00
from __future__ import annotations
2021-03-12 16:53:00 +01:00
import logging
2022-11-23 15:54:22 +01:00
import logging.config
2021-03-12 16:53:00 +01:00
2024-03-09 15:33:10 +01:00
from redis import Redis
from typing import Generator
2024-03-05 20:51:21 +01:00
from lookyloo import Lookyloo, Indexing
2024-03-09 15:33:10 +01:00
from lookyloo.capturecache import get_pickle_path
from lookyloo.default import AbstractManager, get_config, get_socket_path
2024-03-05 20:51:21 +01:00
from lookyloo.exceptions import NoValidHarFile
2021-03-12 16:53:00 +01:00
2022-11-23 15:54:22 +01:00
# Apply the project's 'logging' configuration as a module-level side effect,
# i.e. as soon as this script is imported or executed.
logging.config.dictConfig(get_config('logging'))
2021-03-12 16:53:00 +01:00
class BackgroundIndexer(AbstractManager):
    """Manager that indexes the captures which are not (fully) indexed yet.

    Two flavours exist, selected with ``full``: the regular indexer
    (``background_indexer``) and the full indexer
    (``background_full_indexer``). On a public instance, the regular indexer
    skips the captures flagged ``no_index``; the full indexer does not.
    """

    def __init__(self, full: bool=False, loglevel: int | None=None):
        """Set up the Lookyloo, Indexing and Redis handles used by the indexer.

        :param full: Run as the full indexer instead of the regular one.
        :param loglevel: Log level forwarded to ``AbstractManager``.
        """
        super().__init__(loglevel)
        self.lookyloo = Lookyloo(cache_max_size=1)
        self.is_public_instance = get_config('generic', 'public_instance')
        self.full_indexer = full
        self.indexing = Indexing(full_index=self.full_indexer)
        if self.full_indexer:
            self.script_name = 'background_full_indexer'
        else:
            self.script_name = 'background_indexer'

        # Redis connector so we don't use the one from Lookyloo
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    def _to_run_forever(self) -> None:
        """Run one indexing pass, then drop the tree cache.

        NOTE(review): presumably the hook AbstractManager.run() calls on
        every cycle -- confirm against AbstractManager.
        """
        self._check_indexes()
        # Don't need the cache in this class.
        self.lookyloo.clear_tree_cache()

    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]:
        """Yield ``(indexed, uuid)`` for every capture that still needs indexing.

        ``indexed`` is the tuple of flags returned by
        ``Indexing.capture_indexed``; ``_check_indexes`` reads them
        positionally as: urls, resources (body hashes), cookies, HTTP headers
        hashes, favicons, identifiers.
        """
        # NOTE: only get the non-archived captures for now.
        for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
            # If we're not running the full indexer, check if the capture should be indexed.
            if (not self.full_indexer
                    and self.is_public_instance
                    and self.redis.hexists(directory, 'no_index')):
                # Capture unindexed
                continue

            if get_pickle_path(directory) is None:
                # pickle isn't ready, we can't index.
                continue

            indexed = self.indexing.capture_indexed(uuid)
            if all(indexed):
                # Nothing left to do for this capture.
                continue
            yield indexed, uuid

    def _check_indexes(self) -> None:
        """Index whatever is missing for each capture from ``_to_index_no_cache``.

        Does nothing if ``self.indexing.can_index`` is falsy. Otherwise,
        ``self.indexing.indexing_done()`` is always called once the pass
        ends, including when an unexpected exception interrupts it, so a
        failed pass cannot leave indexing marked as ongoing.
        NOTE(review): assumes can_index / indexing_done behave as an
        acquire / release pair -- confirm against Indexing.
        """
        if not self.indexing.can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Indexing already ongoing in another process.')
            return None
        self.logger.info(f'Check {self.script_name}...')
        try:
            for indexed, uuid_to_index in self._to_index_no_cache():
                try:
                    ct = self.lookyloo.get_crawled_tree(uuid_to_index)
                except NoValidHarFile:
                    self.logger.warning(f'Broken pickle for {uuid_to_index}')
                    self.lookyloo.remove_pickle(uuid_to_index)
                    continue

                if not indexed[0]:
                    self.logger.info(f'Indexing urls for {uuid_to_index}')
                    self.indexing.index_url_capture(ct)
                if not indexed[1]:
                    self.logger.info(f'Indexing resources for {uuid_to_index}')
                    self.indexing.index_body_hashes_capture(ct)
                if not indexed[2]:
                    self.logger.info(f'Indexing cookies for {uuid_to_index}')
                    self.indexing.index_cookies_capture(ct)
                if not indexed[3]:
                    self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
                    self.indexing.index_http_headers_hashes_capture(ct)
                if not indexed[4]:
                    self.logger.info(f'Indexing favicons for {uuid_to_index}')
                    # Favicons are fetched through Lookyloo, not read from the tree.
                    favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
                    self.indexing.index_favicons_capture(uuid_to_index, favicons)
                if not indexed[5]:
                    self.logger.info(f'Indexing identifiers for {uuid_to_index}')
                    self.indexing.index_identifiers_capture(ct)
                # NOTE: categories aren't taken in account here, should be fixed(?)
                # see indexing.index_categories_capture(capture_uuid, categories)
        finally:
            # Always signal the end of the pass, even if it was interrupted.
            self.indexing.indexing_done()
        self.logger.info('... done.')
2021-03-12 16:53:00 +01:00
2024-01-12 17:15:41 +01:00
def main() -> None:
    """Entry point of the regular indexer: build it and call run(sleep_in_sec=60)."""
    BackgroundIndexer().run(sleep_in_sec=60)
2024-03-05 20:51:21 +01:00
def main_full_indexer() -> None:
    """Entry point of the full indexer.

    Raises an ``Exception`` when the 'index_everything' setting of the
    'generic' configuration is falsy; otherwise builds the indexer with
    ``full=True`` and calls run(sleep_in_sec=60).
    """
    index_everything = get_config('generic', 'index_everything')
    if not index_everything:
        raise Exception('Full indexer is disabled.')
    # NOTE: for now, it only indexes the captures that aren't archived.
    #       we will change that later, but for now, it's a good start.
    BackgroundIndexer(full=True).run(sleep_in_sec=60)
2021-03-12 16:53:00 +01:00
# Executed as a script: start the regular (non-full) indexer.
if __name__ == '__main__':
    main()