new: background indexer

pull/184/head
Raphaël Vinot 2021-03-12 16:53:00 +01:00
parent 46aea0fe3a
commit b3541e0e78
2 changed files with 86 additions and 0 deletions

85
bin/background_indexer.py Executable file
View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
from lookyloo.abstractmanager import AbstractManager
from lookyloo.helpers import set_running, unset_running
from lookyloo.lookyloo import Lookyloo
from lookyloo.exceptions import NoValidHarFile
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
level=logging.INFO, datefmt='%I:%M:%S')
class BackgroundIndexer(AbstractManager):
def __init__(self, loglevel: int=logging.INFO):
super().__init__(loglevel)
self.lookyloo = Lookyloo()
# make sure discarded captures dir exists
self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
def _to_run_forever(self):
set_running('background_indexer')
self._build_missing_pickles()
self._check_indexes()
unset_running('background_indexer')
def _build_missing_pickles(self):
for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
if (uuid_path.parent / 'tree.pickle').exists():
continue
with uuid_path.open() as f:
uuid = f.read()
try:
self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
self.lookyloo.get_crawled_tree(uuid)
except NoValidHarFile:
self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
# The capture is not working, moving it away.
self.lookyloo.redis.hdel('lookup_dirs', uuid)
uuid_path.parent.rename(self.discarded_captures_dir / uuid_path.parent.name)
def _check_indexes(self):
for uuid in self.lookyloo.capture_uuids:
cache = self.lookyloo.capture_cache(uuid)
if not cache:
# Shouldn't happen, but ignore in this process
continue
if self.lookyloo.is_public_instance and cache.no_index:
# Capture unindexed
continue
p = self.lookyloo.indexing.redis.pipeline()
p.sismember('indexed_urls', uuid)
p.sismember('indexed_body_hashes', uuid)
p.sismember('indexed_cookies', uuid)
indexed = p.execute()
if all(indexed):
continue
try:
ct = self.lookyloo.get_crawled_tree(uuid)
except NoValidHarFile:
self.logger.warning(f'Broken pickle for {uuid}')
self.lookyloo.remove_pickle(uuid)
continue
if not indexed[0]:
self.logger.info(f'Indexing urls for {uuid}')
self.lookyloo.indexing.index_url_capture(ct)
if not indexed[1]:
self.logger.info(f'Indexing resources for {uuid}')
self.lookyloo.indexing.index_body_hashes_capture(ct)
if not indexed[2]:
self.logger.info(f'Indexing cookies for {uuid}')
self.lookyloo.indexing.index_cookies_capture(ct)
def main():
i = BackgroundIndexer()
i.run(sleep_in_sec=60)
if __name__ == '__main__':
main()

View File

@@ -28,6 +28,7 @@ shutdown = "bin.shutdown:main"
stop = "bin.stop:main"
rebuild_caches = "bin.rebuild_caches:main"
update = "bin.update:main"
background_indexer = "bin.background_indexer:main"
[tool.poetry.dependencies]