mirror of https://github.com/CIRCL/lookyloo
				
				
				
			new: background indexer
							parent
							
								
									46aea0fe3a
								
							
						
					
					
						commit
						b3541e0e78
					
				|  | @ -0,0 +1,85 @@ | |||
| #!/usr/bin/env python3 | ||||
| # -*- coding: utf-8 -*- | ||||
| 
 | ||||
| import logging | ||||
| 
 | ||||
| from lookyloo.abstractmanager import AbstractManager | ||||
| from lookyloo.helpers import set_running, unset_running | ||||
| from lookyloo.lookyloo import Lookyloo | ||||
| from lookyloo.exceptions import NoValidHarFile | ||||
| 
 | ||||
# Process-wide logging setup: timestamped, named, leveled records on stderr.
# NOTE(review): datefmt '%I:%M:%S' is 12-hour time without a date or AM/PM
# marker — presumably intentional for short-lived daemon logs; confirm.
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO, datefmt='%I:%M:%S')
 | ||||
| 
 | ||||
class BackgroundIndexer(AbstractManager):
    """Background daemon that repairs capture pickles and keeps the redis indexes current.

    Each pass (driven by AbstractManager.run) does two things:
      1. builds the tree pickle for any capture that is missing one, moving
         unrecoverable captures out of the way;
      2. (re-)indexes urls, body hashes and cookies for captures whose
         indexes are incomplete.
    """

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        # Broken captures are moved here instead of being deleted, so an
        # operator can inspect or recover them later.
        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)

    def _to_run_forever(self):
        """One maintenance pass; called repeatedly by AbstractManager.run."""
        set_running('background_indexer')
        try:
            self._build_missing_pickles()
            self._check_indexes()
        finally:
            # Always clear the running flag, even when an unexpected error
            # (redis outage, filesystem failure, ...) aborts the pass —
            # otherwise the service would look stuck-running forever.
            unset_running('background_indexer')

    def _build_missing_pickles(self):
        """Build the tree pickle for every capture directory that lacks one.

        Captures whose HAR files cannot be parsed (NoValidHarFile) are
        unindexed from redis and moved to the discarded captures directory.
        """
        for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
            if (uuid_path.parent / 'tree.pickle').exists():
                # Pickle already present, nothing to do for this capture.
                continue
            with uuid_path.open() as f:
                # NOTE(review): assumes the uuid file has no trailing
                # newline (written elsewhere in the project) — confirm.
                uuid = f.read()
            try:
                self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
                # Building the crawled tree caches the pickle as a side effect.
                self.lookyloo.get_crawled_tree(uuid)
            except NoValidHarFile:
                self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
                # The capture is not working, moving it away.
                self.lookyloo.redis.hdel('lookup_dirs', uuid)
                uuid_path.parent.rename(self.discarded_captures_dir / uuid_path.parent.name)

    def _check_indexes(self):
        """(Re-)index urls, body hashes and cookies for every cached capture."""
        for uuid in self.lookyloo.capture_uuids:
            cache = self.lookyloo.capture_cache(uuid)
            if not cache:
                # Shouldn't happen, but ignore in this process
                continue
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture unindexed
                continue
            # Batch the three membership checks in one round-trip.
            p = self.lookyloo.indexing.redis.pipeline()
            p.sismember('indexed_urls', uuid)
            p.sismember('indexed_body_hashes', uuid)
            p.sismember('indexed_cookies', uuid)
            indexed = p.execute()
            if all(indexed):
                # Fully indexed already — skip the expensive tree load.
                continue
            try:
                ct = self.lookyloo.get_crawled_tree(uuid)
            except NoValidHarFile:
                self.logger.warning(f'Broken pickle for {uuid}')
                self.lookyloo.remove_pickle(uuid)
                continue

            # Order matches the pipeline above: urls, body hashes, cookies.
            if not indexed[0]:
                self.logger.info(f'Indexing urls for {uuid}')
                self.lookyloo.indexing.index_url_capture(ct)
            if not indexed[1]:
                self.logger.info(f'Indexing resources for {uuid}')
                self.lookyloo.indexing.index_body_hashes_capture(ct)
            if not indexed[2]:
                self.logger.info(f'Indexing cookies for {uuid}')
                self.lookyloo.indexing.index_cookies_capture(ct)
| 
 | ||||
| 
 | ||||
def main():
    """Instantiate the background indexer and loop forever, one pass per minute."""
    indexer = BackgroundIndexer()
    indexer.run(sleep_in_sec=60)
| 
 | ||||
| 
 | ||||
# Script entry point (also exposed as the `background_indexer` poetry script).
if __name__ == '__main__':
    main()
|  | @ -28,6 +28,7 @@ shutdown = "bin.shutdown:main" | |||
stop = "bin.stop:main"
rebuild_caches = "bin.rebuild_caches:main"
update = "bin.update:main"
background_indexer = "bin.background_indexer:main"
| 
 | ||||
| 
 | ||||
| [tool.poetry.dependencies] | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Raphaël Vinot
						Raphaël Vinot