lookyloo/bin/background_indexer.py

155 lines
7.0 KiB
Python
Raw Normal View History

2021-03-12 16:53:00 +01:00
#!/usr/bin/env python3
import logging
2022-11-23 15:54:22 +01:00
import logging.config
import os
import shutil
2021-03-12 16:53:00 +01:00
from pathlib import Path
from typing import Optional
2023-07-25 19:29:53 +02:00
from lookyloo.default import AbstractManager, get_config
2021-09-07 12:59:31 +02:00
from lookyloo.exceptions import MissingUUID, NoValidHarFile
2021-03-12 16:53:00 +01:00
from lookyloo.lookyloo import Lookyloo
2023-07-25 17:16:59 +02:00
from lookyloo.helpers import is_locked
2021-03-12 16:53:00 +01:00
2022-11-23 15:54:22 +01:00
# Configure the logging module at import time from the 'logging' entry
# returned by get_config().
logging.config.dictConfig(get_config('logging'))
2021-03-12 16:53:00 +01:00
class BackgroundIndexer(AbstractManager):
    """Build missing capture pickles and fill in missing index entries.

    One pass (``_to_run_forever``) first builds tree pickles for captures that
    do not have one yet. Only when that pass built at least one pickle and
    went through the whole backlog, the indexes of all captures are checked.
    """

    def __init__(self, loglevel: Optional[int]=None) -> None:
        """Set up the Lookyloo handle and the directory for discarded captures.

        :param loglevel: forwarded as-is to ``AbstractManager.__init__``.
        """
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self) -> None:
        """Run one pass: build missing pickles, then (maybe) check the indexes.

        NOTE(review): presumably invoked repeatedly by ``AbstractManager.run``;
        confirm against the base class.
        """
        all_done = self._build_missing_pickles()
        if all_done:
            self._check_indexes()
        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)

    def _discard_capture(self, capture_dir: Path) -> None:
        """Move *capture_dir* into the discarded captures directory, keeping its name."""
        shutil.move(str(capture_dir), str(self.discarded_captures_dir / capture_dir.name))

    def _build_missing_pickles(self) -> bool:
        """Build the tree pickle of every capture that has a HAR file but no pickle.

        Captures are walked in reverse sorted path order, and at most 50 pickles
        are built per call so a huge backlog cannot monopolise the process.

        :return: ``True`` only if at least one pickle was built and the whole
                 backlog was walked; ``False`` if nothing was built or the
                 per-call limit was reached.
        """
        self.logger.debug('Build missing pickles...')
        # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
        # This value makes sure we break out of the loop and build pickles of the most recent captures
        max_captures = 50
        got_new_captures = False
        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/*/*/uuid'), reverse=True):
            capture_dir = uuid_path.parent
            if (capture_dir / 'tree.pickle.gz').exists() or (capture_dir / 'tree.pickle').exists():
                # We already have a pickle file
                self.logger.debug(f'{capture_dir} has a pickle.')
                continue
            # any() stops at the first match instead of listing the whole directory tree.
            if not any(capture_dir.rglob('*.har.gz')) and not any(capture_dir.rglob('*.har')):
                # No HAR file
                self.logger.debug(f'{capture_dir} has no HAR file.')
                continue

            if is_locked(capture_dir):
                # it is really locked
                self.logger.debug(f'{capture_dir} is locked, pickle generated by another process.')
                continue

            with uuid_path.open() as f:
                # Strip so a stray trailing newline in the file does not end up
                # in the redis key / the UUID passed to Lookyloo.
                uuid = f.read().strip()

            # A single HGET tells us both whether the UUID is known and where it
            # points to; a missing field comes back as None.
            cached_dir = self.lookyloo.redis.hget('lookup_dirs', uuid)
            if cached_dir is None:
                # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                self.lookyloo.redis.hset('lookup_dirs', uuid, str(capture_dir))
            else:
                cached_path = Path(cached_dir)
                if cached_path != capture_dir:
                    # we have a duplicate UUID, it is probably related to some bad copy/paste
                    if cached_path.exists():
                        # Both paths exist, move the one that isn't in lookup_dirs
                        self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {capture_dir}, discarding the latest')
                        self._discard_capture(capture_dir)
                        continue
                    # The path in lookup_dirs for that UUID doesn't exist, just update it.
                    self.lookyloo.redis.hset('lookup_dirs', uuid, str(capture_dir))

            try:
                self.logger.info(f'Build pickle for {uuid}: {capture_dir.name}')
                self.lookyloo.get_crawled_tree(uuid)
                self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                self.logger.info(f'Pickle for {uuid} build.')
                got_new_captures = True
                max_captures -= 1
            except MissingUUID:
                self.logger.warning(f'Unable to find {uuid}. That should not happen.')
            except NoValidHarFile as e:
                self.logger.critical(f'There are no HAR files in the capture {uuid}: {capture_dir.name} - {e}')
            except FileNotFoundError:
                self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
            except Exception:
                self.logger.exception(f'Unable to build pickle for {uuid}: {capture_dir.name}')
                # The capture is not working, moving it away.
                self.lookyloo.redis.hdel('lookup_dirs', uuid)
                self._discard_capture(capture_dir)
            if max_captures <= 0:
                self.logger.info('Too many captures in the backlog, start from the beginning.')
                return False
        if got_new_captures:
            self.logger.info('Finished building all missing pickles.')
            # Only return True if we built new pickles.
            return True
        return False

    def _check_indexes(self) -> None:
        """Index every capture that is missing from at least one index set.

        The ``ongoing_indexing`` key (set only if absent, expiring after 300
        seconds) is used as a flag so only one process runs this at a time.
        The flag is removed when the method ends, including when an exception
        escapes the loop, so a failure does not block indexing until expiry.
        """
        index_redis = self.lookyloo.indexing.redis
        can_index = index_redis.set('ongoing_indexing', 1, ex=300, nx=True)
        if not can_index:
            # There is no reason to run this method in multiple scripts.
            self.logger.info('Indexing already ongoing in another process.')
            return
        self.logger.info('Check indexes...')
        try:
            for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
                if self.lookyloo.is_public_instance and cache.no_index:
                    # Capture unindexed
                    continue
                # One pipeline round trip for the four membership checks;
                # the results come back in the order they were queued.
                p = index_redis.pipeline()
                p.sismember('indexed_urls', cache.uuid)
                p.sismember('indexed_body_hashes', cache.uuid)
                p.sismember('indexed_cookies', cache.uuid)
                p.sismember('indexed_hhhashes', cache.uuid)
                indexed = p.execute()
                if all(indexed):
                    continue
                try:
                    ct = self.lookyloo.get_crawled_tree(cache.uuid)
                except NoValidHarFile:
                    self.logger.warning(f'Broken pickle for {cache.uuid}')
                    self.lookyloo.remove_pickle(cache.uuid)
                    continue

                if not indexed[0]:
                    self.logger.info(f'Indexing urls for {cache.uuid}')
                    self.lookyloo.indexing.index_url_capture(ct)
                if not indexed[1]:
                    self.logger.info(f'Indexing resources for {cache.uuid}')
                    self.lookyloo.indexing.index_body_hashes_capture(ct)
                if not indexed[2]:
                    self.logger.info(f'Indexing cookies for {cache.uuid}')
                    self.lookyloo.indexing.index_cookies_capture(ct)
                if not indexed[3]:
                    self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
                    self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
                # NOTE: categories aren't taken in account here, should be fixed(?)
                #       see indexing.index_categories_capture(capture_uuid, categories)
        finally:
            # NOTE(review): if a run lasts longer than the 300s expiry, another
            # process may have set the flag in the meantime and this delete
            # would remove theirs - same as before this change, to be confirmed.
            index_redis.delete('ongoing_indexing')
        self.logger.info('... done.')
2021-03-12 16:53:00 +01:00
def main():
    """Create a BackgroundIndexer and start it, passing ``sleep_in_sec=60`` to ``run``."""
    indexer = BackgroundIndexer()
    indexer.run(sleep_in_sec=60)


# Only start the indexer when the file is executed as a script.
if __name__ == '__main__':
    main()