#!/usr/bin/env python3
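"""Background indexer for Lookyloo.

Periodically goes over the capture cache and builds the URL, body hash,
cookie, HTTP header hash, and favicon indexes for captures that are not
fully indexed yet.
"""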
from __future__ import annotations

import logging
import logging.config
import os

from lookyloo import Lookyloo, Indexing
from lookyloo.default import AbstractManager, get_config
from lookyloo.exceptions import NoValidHarFile

logging.config.dictConfig(get_config('logging'))


class BackgroundIndexer(AbstractManager):
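    """Manager that builds the capture indexes in the background.

    With full=True it feeds the full index (used when `index_everything` is
    enabled); otherwise it skips captures flagged as non-indexable on a
    public instance.
    """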

    def __init__(self, full: bool=False, loglevel: int | None=None):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.full_indexer = full
        self.indexing = Indexing(full_index=self.full_indexer)
        if self.full_indexer:
            self.script_name = 'background_full_indexer'
        else:
            self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self) -> None:
        self._check_indexes()
        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)

    def _check_indexes(self) -> None:
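        """Go over the capture cache and build any index that is still missing for a capture."""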
        if not self.indexing.can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Indexing already ongoing in another process.')
            return None
        self.logger.info(f'Check {self.script_name}...')
        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
            if not self.full_indexer:
                # If we're not running the full indexer, check if the capture should be indexed.
                if self.lookyloo.is_public_instance and cache.no_index:
                    # The capture is flagged as not to be indexed, skip it.
                    continue
            if not cache.tree_ready:
                # pickle isn't ready, we can't index.
                continue
            indexed = self.indexing.capture_indexed(cache.uuid)
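            # The flags below are checked in order: URLs, body hashes (resources),
            # cookies, HTTP header hashes, favicons.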
            if all(indexed):
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(cache.uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {cache.uuid}')
                self.lookyloo.remove_pickle(cache.uuid)
                continue

            if not indexed[0]:
                self.logger.info(f'Indexing urls for {cache.uuid}')
                self.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {cache.uuid}')
                self.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {cache.uuid}')
                self.indexing.index_cookies_capture(ct)
            if not indexed[3]:
                self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
                self.indexing.index_http_headers_hashes_capture(ct)
            if not indexed[4]:
                self.logger.info(f'Indexing favicons for {cache.uuid}')
                favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
                self.indexing.index_favicons_capture(cache.uuid, favicons)
            # NOTE: categories aren't taken into account here, should be fixed(?)
            #       see indexing.index_categories_capture(capture_uuid, categories)
        self.indexing.indexing_done()
        self.logger.info('... done.')


def main() -> None:
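    """Entry point for the regular background indexer; wakes up every 60 seconds."""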
    i = BackgroundIndexer()
    i.run(sleep_in_sec=60)


def main_full_indexer() -> None:
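    """Entry point for the full background indexer; requires `index_everything` to be enabled in the generic config."""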
    if not get_config('generic', 'index_everything'):
        raise Exception('Full indexer is disabled.')
    # NOTE: for now, it only indexes the captures that aren't archived.
    #       we will change that later, but for now, it's a good start.
    i = BackgroundIndexer(full=True)
    i.run(sleep_in_sec=60)


if __name__ == '__main__':
    main()