2021-03-12 16:53:00 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2024-01-12 17:15:41 +01:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2021-03-12 16:53:00 +01:00
|
|
|
import logging
|
2022-11-23 15:54:22 +01:00
|
|
|
import logging.config
|
2021-09-24 12:02:28 +02:00
|
|
|
import os
|
2022-04-08 14:28:06 +02:00
|
|
|
import shutil
|
2021-03-12 16:53:00 +01:00
|
|
|
|
2023-11-15 15:31:11 +01:00
|
|
|
from datetime import datetime, timedelta
|
2023-07-26 22:16:00 +02:00
|
|
|
from pathlib import Path
|
2023-04-05 16:23:46 +02:00
|
|
|
|
2024-01-13 01:24:32 +01:00
|
|
|
from lookyloo import Lookyloo
|
2023-07-25 19:29:53 +02:00
|
|
|
from lookyloo.default import AbstractManager, get_config
|
2021-09-07 12:59:31 +02:00
|
|
|
from lookyloo.exceptions import MissingUUID, NoValidHarFile
|
2023-11-15 15:31:11 +01:00
|
|
|
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list
|
2021-03-12 16:53:00 +01:00
|
|
|
|
2022-11-23 15:54:22 +01:00
|
|
|
|
|
|
|
# Configure the logging module at import time from the project's 'logging' configuration.
logging.config.dictConfig(get_config('logging'))
|
2021-03-12 16:53:00 +01:00
|
|
|
|
|
|
|
|
|
|
|
class BackgroundIndexer(AbstractManager):
    """Background worker that builds missing capture pickles and keeps the indexes up to date."""

    def __init__(self, loglevel: int | None=None):
        """Set up the Lookyloo handle, the script name and the discarded captures directory.

        :param loglevel: log level forwarded to the parent manager; None leaves the choice to it.
        """
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self) -> None:
        """Run one pass: build missing pickles, then check the indexes if that pass reported completion."""
        all_done = self._build_missing_pickles()
        if all_done:
            self._check_indexes()
            # Disable probabilistic indexing for now, mmh3 isn't a fuzzy hash algo.
            # self._check_probabilistic_indexes()
        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)

    def _build_missing_pickles(self) -> bool:
        """Build the tree pickle of recent captures that do not have one yet.

        Captures are walked per directory returned by make_dirs_list, most recent first,
        ignoring the ones older than the configured archive interval.

        :return: True only if at least one pickle was built and the pass went through every
                 directory; False if nothing was built, if a shutdown was requested, or if the
                 per-pass limit of built pickles was reached.
        """
        self.logger.debug('Build missing pickles...')
        # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
        # This value makes sure we break out of the loop and build pickles of the most recent captures
        max_captures = 50
        got_new_captures = False

        # Initialize time where we do not want to build the pickles anymore.
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = (datetime.now() - archive_interval)
        for month_dir in make_dirs_list(self.lookyloo.capture_dir):
            counter_shutdown = 0
            for _capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
                counter_shutdown += 1
                # NOTE(review): `counter_shutdown % 10` is truthy on every iteration except the
                # multiples of 10, so the shutdown check runs on 9 iterations out of 10.
                # If the intent was to only check every 10th capture, this should be `% 10 == 0`.
                # Behaviour kept as-is, to be confirmed.
                if counter_shutdown % 10 and self.shutdown_requested():
                    self.logger.warning('Shutdown requested, breaking.')
                    return False
                if ((path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists()):
                    # We already have a pickle file
                    self.logger.debug(f'{path} has a pickle.')
                    continue
                if not list(path.rglob('*.har.gz')) and not list(path.rglob('*.har')):
                    # No HAR file
                    self.logger.debug(f'{path} has no HAR file.')
                    continue

                if is_locked(path):
                    # it is really locked
                    self.logger.debug(f'{path} is locked, pickle generated by another process.')
                    continue

                try:
                    with (path / 'uuid').open() as f:
                        uuid = f.read()
                except FileNotFoundError:
                    # The directory has no UUID file (or disappeared in the meantime):
                    # skip it instead of aborting the whole pass.
                    self.logger.warning(f'Unable to find the UUID file in {path}, skipping.')
                    continue

                if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                    # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                    self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
                else:
                    cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))  # type: ignore[arg-type]
                    if cached_path != path:
                        # we have a duplicate UUID, it is probably related to some bad copy/paste
                        if cached_path.exists():
                            # Both paths exist, move the one that isn't in lookup_dirs
                            self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {path}, discarding the latest')
                            try:
                                shutil.move(str(path), str(self.discarded_captures_dir / path.name))
                            except FileNotFoundError as e:
                                self.logger.warning(f'Unable to move capture: {e}')
                            # NOTE(review): source indentation was lost; skipping the duplicate
                            # whether or not the move succeeded is assumed here - confirm.
                            continue
                        else:
                            # The path in lookup_dirs for that UUID doesn't exist, just update it.
                            self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))

                try:
                    self.logger.info(f'Build pickle for {uuid}: {path.name}')
                    self.lookyloo.get_crawled_tree(uuid)
                    self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                    self.logger.info(f'Pickle for {uuid} built.')
                    got_new_captures = True
                    max_captures -= 1
                except MissingUUID:
                    self.logger.warning(f'Unable to find {uuid}. That should not happen.')
                except NoValidHarFile as e:
                    self.logger.critical(f'There are no HAR files in the capture {uuid}: {path.name} - {e}')
                except FileNotFoundError:
                    self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
                except Exception:
                    self.logger.exception(f'Unable to build pickle for {uuid}: {path.name}')
                    # The capture is not working, moving it away.
                    try:
                        shutil.move(str(path), str(self.discarded_captures_dir / path.name))
                        self.lookyloo.redis.hdel('lookup_dirs', uuid)
                    except FileNotFoundError as e:
                        self.logger.warning(f'Unable to move capture: {e}')
                    # Nothing was built on this path, so the backlog limit below cannot have changed.
                    continue
                if max_captures <= 0:
                    self.logger.info('Too many captures in the backlog, start from the beginning.')
                    return False
        if got_new_captures:
            self.logger.info('Finished building all missing pickles.')
            # Only return True if we built new pickles.
            return True
        return False

    def _check_indexes(self) -> None:
        """Index every capture that is missing from at least one of the index sets.

        A Redis key ('ongoing_indexing', set only if absent, expiring after 3600 seconds) is
        used as a lock so the method does not run in several scripts at once. The key is
        removed when the method exits, including when an exception interrupts the loop.
        """
        index_redis = self.lookyloo.indexing.redis
        can_index = index_redis.set('ongoing_indexing', 1, ex=3600, nx=True)
        if not can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Indexing already ongoing in another process.')
            return None
        self.logger.info('Check indexes...')
        try:
            for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
                if self.lookyloo.is_public_instance and cache.no_index:
                    # Capture unindexed
                    continue
                # One round trip to know which indexes already contain this capture.
                # The order of the calls defines the positions used in `indexed` below.
                p = index_redis.pipeline()
                p.sismember('indexed_urls', cache.uuid)
                p.sismember('indexed_body_hashes', cache.uuid)
                p.sismember('indexed_cookies', cache.uuid)
                p.sismember('indexed_hhhashes', cache.uuid)
                p.sismember('indexed_favicons', cache.uuid)
                indexed = p.execute()
                if all(indexed):
                    continue
                try:
                    ct = self.lookyloo.get_crawled_tree(cache.uuid)
                except NoValidHarFile:
                    self.logger.warning(f'Broken pickle for {cache.uuid}')
                    self.lookyloo.remove_pickle(cache.uuid)
                    continue

                if not indexed[0]:
                    self.logger.info(f'Indexing urls for {cache.uuid}')
                    self.lookyloo.indexing.index_url_capture(ct)
                if not indexed[1]:
                    self.logger.info(f'Indexing resources for {cache.uuid}')
                    self.lookyloo.indexing.index_body_hashes_capture(ct)
                if not indexed[2]:
                    self.logger.info(f'Indexing cookies for {cache.uuid}')
                    self.lookyloo.indexing.index_cookies_capture(ct)
                if not indexed[3]:
                    self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
                    self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
                if not indexed[4]:
                    self.logger.info(f'Indexing favicons for {cache.uuid}')
                    favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
                    self.lookyloo.indexing.index_favicons_capture(cache.uuid, favicons)
                # NOTE: categories aren't taken in account here, should be fixed(?)
                # see indexing.index_categories_capture(capture_uuid, categories)
        finally:
            # Always release the lock, otherwise a failure blocks indexing until the key expires.
            index_redis.delete('ongoing_indexing')
        self.logger.info('... done.')

    def _check_probabilistic_indexes(self) -> None:
        """Run the probabilistic favicon indexing for every capture missing from it.

        Uses the same locking approach as _check_indexes, with its own Redis key, which is
        removed when the method exits, including when an exception interrupts the loop.
        """
        index_redis = self.lookyloo.indexing.redis
        can_index = index_redis.set('ongoing_probalistic_indexing', 1, ex=3600, nx=True)
        if not can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Probalistic indexing already ongoing in another process.')
            return None
        self.logger.info('Check probabilistic indexes...')
        algorithms = ['mmh3-shodan']
        try:
            for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
                if self.lookyloo.is_public_instance and cache.no_index:
                    # Capture unindexed
                    continue
                # One membership check per algorithm, results come back in the same order.
                p = index_redis.pipeline()
                for algorithm in algorithms:
                    p.sismember(f'indexed_favicons_probabilistic|{algorithm}', cache.uuid)
                indexed = p.execute()
                if all(indexed):
                    continue
                for i, algorithm in enumerate(algorithms):
                    if not indexed[i]:
                        self.logger.info(f'Probabilistic indexing favicons for {cache.uuid} with {algorithm}')
                        favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
                        self.lookyloo.indexing.index_favicons_probabilistic(cache.uuid, favicons, algorithm)
        finally:
            # Always release the lock, otherwise a failure blocks indexing until the key expires.
            index_redis.delete('ongoing_probalistic_indexing')
        self.logger.info('... done.')
|
|
|
|
|
2021-03-12 16:53:00 +01:00
|
|
|
|
2024-01-12 17:15:41 +01:00
|
|
|
def main() -> None:
    """Create a BackgroundIndexer and call its run method with a 60 seconds sleep value."""
    indexer = BackgroundIndexer()
    indexer.run(sleep_in_sec=60)
|
|
|
|
|
|
|
|
|
|
|
|
# Only start the indexer when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|