mirror of https://github.com/CIRCL/lookyloo
				
				
				
			fix: use a more direct way to index
							parent
							
								
									7bd7488bd4
								
							
						
					
					
						commit
						d2df33aa5c
					
				|  | @ -5,8 +5,12 @@ from __future__ import annotations | |||
| import logging | ||||
| import logging.config | ||||
| 
 | ||||
| from redis import Redis | ||||
| from typing import Generator | ||||
| 
 | ||||
| from lookyloo import Lookyloo, Indexing | ||||
| from lookyloo.default import AbstractManager, get_config | ||||
| from lookyloo.capturecache import get_pickle_path | ||||
| from lookyloo.default import AbstractManager, get_config, get_socket_path | ||||
| from lookyloo.exceptions import NoValidHarFile | ||||
| 
 | ||||
| 
 | ||||
|  | @ -18,6 +22,7 @@ class BackgroundIndexer(AbstractManager): | |||
|     def __init__(self, full: bool=False, loglevel: int | None=None): | ||||
|         super().__init__(loglevel) | ||||
|         self.lookyloo = Lookyloo() | ||||
|         self.is_public_instance = get_config('generic', 'public_instance') | ||||
|         self.full_indexer = full | ||||
|         self.indexing = Indexing(full_index=self.full_indexer) | ||||
|         if self.full_indexer: | ||||
|  | @ -25,52 +30,61 @@ class BackgroundIndexer(AbstractManager): | |||
|         else: | ||||
|             self.script_name = 'background_indexer' | ||||
| 
 | ||||
|         # Redis connector so we don't use the one from Lookyloo | ||||
|         self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True) | ||||
| 
 | ||||
|     # One pass of the manager: run the index check, then drop the tree cache held by self.lookyloo. | ||||
|     def _to_run_forever(self) -> None: | ||||
|         self._check_indexes() | ||||
|         # Don't need the cache in this class. | ||||
|         self.lookyloo.clear_tree_cache() | ||||
| 
 | ||||
|     # Generator over the entries of the 'lookup_dirs' Redis hash (uuid -> capture directory). | ||||
|     # Yields (indexed, uuid) for each capture whose pickle exists and for which | ||||
|     # self.indexing.capture_indexed(uuid) reports at least one falsy flag. | ||||
|     # Only self.redis, get_pickle_path and self.indexing are used here. | ||||
|     def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]: | ||||
|         # NOTE: only get the non-archived captures for now. | ||||
|         for uuid, directory in self.redis.hscan_iter('lookup_dirs'): | ||||
|             if not self.full_indexer: | ||||
|                 # If we're not running the full indexer, check if the capture should be indexed. | ||||
|                 if self.is_public_instance and self.redis.hexists(directory, 'no_index'): | ||||
|                     # Public instance and the capture's Redis hash has a 'no_index' field: skip it. | ||||
|                     continue | ||||
| 
 | ||||
|             if get_pickle_path(directory) is None: | ||||
|                 # pickle isn't ready, we can't index. | ||||
|                 continue | ||||
|             indexed = self.indexing.capture_indexed(uuid) | ||||
|             # Every flag is already truthy: nothing left to index for this capture. | ||||
|             if all(indexed): | ||||
|                 continue | ||||
|             yield indexed, uuid | ||||
| 
 | ||||
|     def _check_indexes(self) -> None: | ||||
|         if not self.indexing.can_index: | ||||
|             # There is no reason to run this method in multiple scripts. | ||||
|             self.logger.info('Indexing already ongoing in another process.') | ||||
|             return None | ||||
|         self.logger.info(f'Check {self.script_name}...') | ||||
|         for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False): | ||||
|             if not self.full_indexer: | ||||
|                 # If we're not running the full indexer, check if the capture should be indexed. | ||||
|                 if self.lookyloo.is_public_instance and cache.no_index: | ||||
|                     # Capture unindexed | ||||
|                     continue | ||||
|             if not cache.tree_ready: | ||||
|                 # pickle isn't ready, we can't index. | ||||
|                 continue | ||||
|             indexed = self.indexing.capture_indexed(cache.uuid) | ||||
|             if all(indexed): | ||||
|                 continue | ||||
|         for indexed, uuid_to_index in self._to_index_no_cache(): | ||||
|             try: | ||||
|                 ct = self.lookyloo.get_crawled_tree(cache.uuid) | ||||
|                 ct = self.lookyloo.get_crawled_tree(uuid_to_index) | ||||
|             except NoValidHarFile: | ||||
|                 self.logger.warning(f'Broken pickle for {cache.uuid}') | ||||
|                 self.lookyloo.remove_pickle(cache.uuid) | ||||
|                 self.logger.warning(f'Broken pickle for {uuid_to_index}') | ||||
|                 self.lookyloo.remove_pickle(uuid_to_index) | ||||
|                 continue | ||||
| 
 | ||||
|             if not indexed[0]: | ||||
|                 self.logger.info(f'Indexing urls for {cache.uuid}') | ||||
|                 self.logger.info(f'Indexing urls for {uuid_to_index}') | ||||
|                 self.indexing.index_url_capture(ct) | ||||
|             if not indexed[1]: | ||||
|                 self.logger.info(f'Indexing resources for {cache.uuid}') | ||||
|                 self.logger.info(f'Indexing resources for {uuid_to_index}') | ||||
|                 self.indexing.index_body_hashes_capture(ct) | ||||
|             if not indexed[2]: | ||||
|                 self.logger.info(f'Indexing cookies for {cache.uuid}') | ||||
|                 self.logger.info(f'Indexing cookies for {uuid_to_index}') | ||||
|                 self.indexing.index_cookies_capture(ct) | ||||
|             if not indexed[3]: | ||||
|                 self.logger.info(f'Indexing HH Hashes for {cache.uuid}') | ||||
|                 self.logger.info(f'Indexing HH Hashes for {uuid_to_index}') | ||||
|                 self.indexing.index_http_headers_hashes_capture(ct) | ||||
|             if not indexed[4]: | ||||
|                 self.logger.info(f'Indexing favicons for {cache.uuid}') | ||||
|                 favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False) | ||||
|                 self.indexing.index_favicons_capture(cache.uuid, favicons) | ||||
|                 self.logger.info(f'Indexing favicons for {uuid_to_index}') | ||||
|                 favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False) | ||||
|                 self.indexing.index_favicons_capture(uuid_to_index, favicons) | ||||
|             # NOTE: categories aren't taken into account here, should be fixed(?) | ||||
|             # see indexing.index_categories_capture(capture_uuid, categories) | ||||
|         self.indexing.indexing_done() | ||||
|  |  | |||
|  | @ -95,7 +95,7 @@ class CaptureCache(): | |||
| 
 | ||||
|     @property | ||||
|     def tree_ready(self) -> bool: | ||||
|         return bool(_pickle_path(self.capture_dir)) | ||||
|         return bool(get_pickle_path(self.capture_dir)) | ||||
| 
 | ||||
|     @property | ||||
|     def tree(self) -> CrawledTree: | ||||
|  | @ -106,7 +106,9 @@ class CaptureCache(): | |||
|         return load_pickle_tree(self.capture_dir, self.capture_dir.stat().st_mtime, self.logger) | ||||
| 
 | ||||
| 
 | ||||
| def _pickle_path(capture_dir: Path) -> Path | None: | ||||
| def get_pickle_path(capture_dir: Path | str) -> Path | None: | ||||
|     if isinstance(capture_dir, str): | ||||
|         capture_dir = Path(capture_dir) | ||||
|     pickle_file_gz = capture_dir / 'tree.pickle.gz' | ||||
|     if pickle_file_gz.exists(): | ||||
|         return pickle_file_gz | ||||
|  | @ -119,14 +121,14 @@ def _pickle_path(capture_dir: Path) -> Path | None: | |||
| 
 | ||||
| 
 | ||||
| def remove_pickle_tree(capture_dir: Path) -> None: | ||||
|     pickle_path = _pickle_path(capture_dir) | ||||
|     pickle_path = get_pickle_path(capture_dir) | ||||
|     if pickle_path and pickle_path.exists(): | ||||
|         pickle_path.unlink() | ||||
| 
 | ||||
| 
 | ||||
| @lru_cache(maxsize=64) | ||||
| def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> CrawledTree: | ||||
|     pickle_path = _pickle_path(capture_dir) | ||||
|     pickle_path = get_pickle_path(capture_dir) | ||||
|     tree = None | ||||
|     try: | ||||
|         if pickle_path: | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Raphaël Vinot
						Raphaël Vinot