Mirror of https://github.com/CIRCL/lookyloo
Commit b3541e0e78 (parent 46aea0fe3a): "new: background indexer"
Adds bin/background_indexer.py (new file, +85 lines) and registers the
background_indexer script entry point in pyproject.toml.
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging

from lookyloo.abstractmanager import AbstractManager
from lookyloo.helpers import set_running, unset_running
from lookyloo.lookyloo import Lookyloo
from lookyloo.exceptions import NoValidHarFile

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO, datefmt='%I:%M:%S')
|
||||
|
||||
|
||||
class BackgroundIndexer(AbstractManager):
    """Background worker that repairs and indexes captures.

    On every pass it (1) builds the ``tree.pickle`` for any capture that
    is missing one, moving broken captures out of the way, and
    (2) backfills the URL / body-hash / cookie indexes for captures that
    are not fully indexed yet.
    """

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self):
        """One iteration of the worker loop: rebuild pickles, then indexes."""
        set_running('background_indexer')
        try:
            self._build_missing_pickles()
            self._check_indexes()
        finally:
            # Always clear the running flag, even if an iteration raised,
            # otherwise the process would look stuck forever.
            unset_running('background_indexer')

    def _build_missing_pickles(self):
        """Build the crawled-tree pickle for every capture missing one.

        Captures whose HAR files cannot be parsed are removed from the
        redis lookup table and moved to ``discarded_captures``.
        """
        for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
            if (uuid_path.parent / 'tree.pickle').exists():
                continue
            with uuid_path.open() as f:
                # strip a possible trailing newline so the uuid is clean
                # when used as a redis key / in log messages
                uuid = f.read().strip()
            try:
                self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
                self.lookyloo.get_crawled_tree(uuid)
            except NoValidHarFile:
                self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
                # The capture is not working, moving it away.
                self.lookyloo.redis.hdel('lookup_dirs', uuid)
                uuid_path.parent.rename(self.discarded_captures_dir / uuid_path.parent.name)

    def _check_indexes(self):
        """Backfill the URL, body-hash and cookie indexes for all captures."""
        for uuid in self.lookyloo.capture_uuids:
            cache = self.lookyloo.capture_cache(uuid)
            if not cache:
                # Shouldn't happen, but ignore in this process
                continue
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture unindexed
                continue
            # Batch the three membership checks into one round-trip.
            p = self.lookyloo.indexing.redis.pipeline()
            p.sismember('indexed_urls', uuid)
            p.sismember('indexed_body_hashes', uuid)
            p.sismember('indexed_cookies', uuid)
            indexed = p.execute()
            if all(indexed):
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {uuid}')
                self.lookyloo.remove_pickle(uuid)
                continue

            if not indexed[0]:
                self.logger.info(f'Indexing urls for {uuid}')
                self.lookyloo.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {uuid}')
                self.lookyloo.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {uuid}')
                self.lookyloo.indexing.index_cookies_capture(ct)
|
||||
|
||||
|
||||
def main():
    """Entry point: run the background indexer, one pass per minute."""
    indexer = BackgroundIndexer()
    indexer.run(sleep_in_sec=60)


if __name__ == '__main__':
    main()
|
|
# pyproject.toml, [tool.poetry.scripts] (diff hunk @ -28,6 +28,7):
shutdown = "bin.shutdown:main"
stop = "bin.stop:main"
rebuild_caches = "bin.rebuild_caches:main"
update = "bin.update:main"
background_indexer = "bin.background_indexer:main"


[tool.poetry.dependencies]
|
Loading…
Reference in New Issue