2021-03-12 16:53:00 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import logging
|
2022-11-23 15:54:22 +01:00
|
|
|
import logging.config
|
2021-09-24 12:02:28 +02:00
|
|
|
import os
|
2022-04-08 14:28:06 +02:00
|
|
|
import shutil
|
2021-03-12 16:53:00 +01:00
|
|
|
|
2023-07-25 17:08:00 +02:00
|
|
|
from datetime import datetime
|
2023-04-05 16:23:46 +02:00
|
|
|
from typing import Optional
|
|
|
|
|
2022-11-23 15:54:22 +01:00
|
|
|
from lookyloo.default import AbstractManager, get_config
|
2021-09-07 12:59:31 +02:00
|
|
|
from lookyloo.exceptions import MissingUUID, NoValidHarFile
|
2021-03-12 16:53:00 +01:00
|
|
|
from lookyloo.lookyloo import Lookyloo
|
2023-07-25 17:08:00 +02:00
|
|
|
from lookyloo.helpers import is_locked, try_make_file
|
2021-03-12 16:53:00 +01:00
|
|
|
|
2022-11-23 15:54:22 +01:00
|
|
|
|
|
|
|
# Apply, at import time, the logging configuration returned by get_config('logging').
logging.config.dictConfig(get_config('logging'))
|
2021-03-12 16:53:00 +01:00
|
|
|
|
|
|
|
|
|
|
|
class BackgroundIndexer(AbstractManager):
    """Background worker that builds missing capture pickles and fills the indexes.

    One pass (see ``_to_run_forever``) first builds the tree pickle of every
    capture directory that lacks one, then indexes the captures that are not
    flagged as indexed yet.
    """

    def __init__(self, loglevel: Optional[int]=None):
        """Create the Lookyloo handle and make sure the discarded captures directory exists.

        :param loglevel: forwarded unchanged to ``AbstractManager.__init__``.
        """
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'background_indexer'
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self) -> None:
        """Run one pass: build missing pickles, check the indexes, update the tree cache info.

        NOTE(review): presumably called repeatedly by ``AbstractManager.run`` - confirm.
        """
        self._build_missing_pickles()
        self._check_indexes()
        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)

    @staticmethod
    def _has_har_file(capture_dir) -> bool:
        """Tell whether *capture_dir* (a path object) holds at least one ``*.har`` or ``*.har.gz`` file.

        The search is recursive and stops at the first match.
        """
        for pattern in ('*.har', '*.har.gz'):
            if next(capture_dir.rglob(pattern), None) is not None:
                return True
        return False

    def _build_missing_pickles(self) -> None:
        """Build the tree pickle of every capture directory that does not have one yet.

        Capture directories are found through their ``uuid`` file and visited in
        reverse sorted path order. A capture is skipped when it already has a
        ``tree.pickle``, when it holds no HAR file at all (neither ``*.har`` nor
        ``*.har.gz``), or when ``is_locked`` reports it as locked.
        If building fails with an unexpected exception, the capture is removed
        from ``lookup_dirs`` and its directory is moved to the discarded
        captures directory.
        """
        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
            capture_dir = uuid_path.parent
            if (capture_dir / 'tree.pickle').exists():
                # The pickle is already there, nothing to do.
                continue
            # A capture only needs ONE kind of HAR file (plain or gzipped);
            # requiring both would skip nearly every capture.
            if not self._has_har_file(capture_dir):
                continue

            lock_file = capture_dir / 'lock'
            if try_make_file(lock_file):
                # Lock created, we can process
                with lock_file.open('w') as f:
                    f.write(f"{datetime.now().isoformat()};{os.getpid()}")
            elif is_locked(capture_dir):
                # it is really locked
                continue

            with uuid_path.open() as f:
                uuid = f.read()

            if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                self.lookyloo.redis.hset('lookup_dirs', uuid, str(capture_dir))

            try:
                self.logger.info(f'Build pickle for {uuid}: {capture_dir.name}')
                self.lookyloo.get_crawled_tree(uuid)
                self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                self.logger.info(f'Pickle for {uuid} build.')
            except MissingUUID:
                self.logger.warning(f'Unable to find {uuid}. That should not happen.')
            except NoValidHarFile as e:
                self.logger.critical(f'There are no HAR files in the capture {uuid}: {capture_dir.name} - {e}')
            except Exception as e:
                self.logger.critical(f'Unable to build pickle for {uuid}: {capture_dir.name} - {e}')
                # The capture is not working, moving it away.
                self.lookyloo.redis.hdel('lookup_dirs', uuid)
                shutil.move(str(capture_dir), str(self.discarded_captures_dir / capture_dir.name))

    def _check_indexes(self) -> None:
        """Index every capture that is missing from at least one of the ``indexed_*`` sets.

        For each capture, membership of its UUID in ``indexed_urls``,
        ``indexed_body_hashes``, ``indexed_cookies`` and ``indexed_hhhashes`` is
        queried in one pipeline; only the missing indexes are (re)built.
        On a public instance, captures flagged ``no_index`` are skipped.
        A capture whose tree raises ``NoValidHarFile`` gets its pickle removed
        and is skipped.
        """
        index_redis = self.lookyloo.indexing.redis
        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture unindexed
                continue
            p = index_redis.pipeline()
            p.sismember('indexed_urls', cache.uuid)
            p.sismember('indexed_body_hashes', cache.uuid)
            p.sismember('indexed_cookies', cache.uuid)
            p.sismember('indexed_hhhashes', cache.uuid)
            # Results come back in the order the commands were queued above.
            indexed = p.execute()
            if all(indexed):
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(cache.uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {cache.uuid}')
                self.lookyloo.remove_pickle(cache.uuid)
                continue

            if not indexed[0]:
                self.logger.info(f'Indexing urls for {cache.uuid}')
                self.lookyloo.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {cache.uuid}')
                self.lookyloo.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {cache.uuid}')
                self.lookyloo.indexing.index_cookies_capture(ct)
            if not indexed[3]:
                self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
                self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
            # NOTE: categories aren't taken into account here, should be fixed(?)
            #       see indexing.index_categories_capture(capture_uuid, categories)
|
2021-03-12 16:53:00 +01:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Create a BackgroundIndexer and call its ``run`` with a 60-second sleep interval."""
    BackgroundIndexer().run(sleep_in_sec=60)
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: only start the indexer when executed directly, not on import.
if __name__ == '__main__':
    main()
|