mirror of https://github.com/CIRCL/lookyloo
new: background indexer
parent
46aea0fe3a
commit
b3541e0e78
|
@ -0,0 +1,85 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from lookyloo.abstractmanager import AbstractManager
|
||||||
|
from lookyloo.helpers import set_running, unset_running
|
||||||
|
from lookyloo.lookyloo import Lookyloo
|
||||||
|
from lookyloo.exceptions import NoValidHarFile
|
||||||
|
|
||||||
|
# Configure root logging for the whole process: timestamped records at INFO.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
    level=logging.INFO,
)
|
||||||
|
|
||||||
|
|
||||||
|
class BackgroundIndexer(AbstractManager):
    """Long-running worker that builds missing capture tree pickles and
    backfills the URL / body-hash / cookie indexes for existing captures.

    Runs one maintenance pass per wakeup of the AbstractManager loop.
    """

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        # make sure discarded captures dir exists
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self):
        # Mark the service as running for the duration of one pass.
        # The try/finally guarantees the flag is cleared even if a pass
        # raises (rename failure, redis error, ...), so monitoring does not
        # keep reporting a stale "running" state forever.
        set_running('background_indexer')
        try:
            self._build_missing_pickles()
            self._check_indexes()
        finally:
            unset_running('background_indexer')

    def _build_missing_pickles(self):
        """Build the tree pickle for every capture directory lacking one.

        Captures whose HAR files cannot be parsed are dropped from the
        'lookup_dirs' redis hash and moved to the discarded captures
        directory so they are not retried on every pass.
        """
        for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
            if (uuid_path.parent / 'tree.pickle').exists():
                # Pickle already built, nothing to do for this capture.
                continue
            with uuid_path.open() as f:
                uuid = f.read()
            try:
                self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
                # Side effect of get_crawled_tree: the pickle is written to disk.
                self.lookyloo.get_crawled_tree(uuid)
            except NoValidHarFile:
                self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
                # The capture is not working, moving it away.
                self.lookyloo.redis.hdel('lookup_dirs', uuid)
                uuid_path.parent.rename(self.discarded_captures_dir / uuid_path.parent.name)

    def _check_indexes(self):
        """Index the URLs, body hashes and cookies of every capture that is
        not yet present in the corresponding index."""
        for uuid in self.lookyloo.capture_uuids:
            cache = self.lookyloo.capture_cache(uuid)
            if not cache:
                # Shouldn't happen, but ignore in this process
                continue
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture explicitly excluded from indexing on a public instance.
                continue
            # One redis round-trip for all three membership checks.
            p = self.lookyloo.indexing.redis.pipeline()
            p.sismember('indexed_urls', uuid)
            p.sismember('indexed_body_hashes', uuid)
            p.sismember('indexed_cookies', uuid)
            indexed = p.execute()
            if all(indexed):
                # Fully indexed already; skip the expensive tree load.
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {uuid}')
                self.lookyloo.remove_pickle(uuid)
                continue

            if not indexed[0]:
                self.logger.info(f'Indexing urls for {uuid}')
                self.lookyloo.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {uuid}')
                self.lookyloo.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {uuid}')
                self.lookyloo.indexing.index_cookies_capture(ct)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Entry point: start the background indexer, waking up every minute."""
    indexer = BackgroundIndexer()
    indexer.run(sleep_in_sec=60)


if __name__ == '__main__':
    main()
|
|
shutdown = "bin.shutdown:main"
stop = "bin.stop:main"
rebuild_caches = "bin.rebuild_caches:main"
update = "bin.update:main"
background_indexer = "bin.background_indexer:main"

[tool.poetry.dependencies]