diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 29087aff..e6207635 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -61,11 +61,14 @@ class Indexing(): def index_cookies_capture(self, capture_dir: Path) -> None: print(f'Index cookies {capture_dir}') try: - crawled_tree = Lookyloo.get_crawled_tree(capture_dir) + crawled_tree = load_pickle_tree(capture_dir) except Exception as e: print(e) return + if not crawled_tree: + return + if self.redis.sismember('indexed_cookies', crawled_tree.uuid): # Do not reindex return @@ -115,11 +118,14 @@ class Indexing(): def index_body_hashes_capture(self, capture_dir: Path) -> None: print(f'Index body hashes {capture_dir}') try: - crawled_tree = Lookyloo.get_crawled_tree(capture_dir) + crawled_tree = load_pickle_tree(capture_dir) except Exception as e: print(e) return + if not crawled_tree: + return + if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid): # Do not reindex return @@ -156,6 +162,7 @@ class Lookyloo(): self.logger = logging.getLogger(f'{self.__class__.__name__}') self.configs: Dict[str, Dict[str, Any]] = load_configs() self.logger.setLevel(self.get_config('loglevel')) + self.indexing = Indexing() self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True) self.scrape_dir: Path = get_homedir() / 'scraped' @@ -261,6 +268,12 @@ class Lookyloo(): ct = self.get_crawled_tree(capture_uuid) return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta + def remove_pickle(self, capture_uuid: str) -> None: + capture_dir = self.lookup_capture_dir(capture_uuid) + if not capture_dir: + raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache') + remove_pickle_tree(capture_dir) + def rebuild_cache(self) -> None: self.redis.flushdb() self._init_existing_dumps() @@ -681,24 +694,22 @@ class Lookyloo(): return perma_uuid def get_body_hash_investigator(self, body_hash: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]: - indexing = Indexing() captures = [] - for capture_uuid, url_uuid, url_hostname in indexing.get_body_hash_captures(body_hash): - cache = self.get_capture_cache(capture_uuid) + for capture_uuid, url_uuid, url_hostname in self.indexing.get_body_hash_captures(body_hash): + cache = self.capture_cache(capture_uuid) if cache: captures.append((capture_uuid, cache['title'])) - domains = indexing.get_body_hash_domains(body_hash) + domains = self.indexing.get_body_hash_domains(body_hash) return captures, domains def get_cookie_name_investigator(self, cookie_name: str): - indexing = Indexing() captures = [] - for capture_uuid, url_uuid in indexing.get_cookies_names_captures(cookie_name): - cache = self.get_capture_cache(capture_uuid) + for capture_uuid, url_uuid in self.indexing.get_cookies_names_captures(cookie_name): + cache = self.capture_cache(capture_uuid) if cache: captures.append((capture_uuid, cache['title'])) - domains = [(domain, freq, indexing.cookies_names_domains_values(cookie_name, domain)) - for domain, freq in indexing.get_cookie_domains(cookie_name)] + domains = [(domain, freq, self.indexing.cookies_names_domains_values(cookie_name, domain)) + for domain, freq in self.indexing.get_cookie_domains(cookie_name)] return captures, domains def get_hostnode_investigator(self, capture_uuid: str, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]: @@ -738,15 +749,13 @@ class Lookyloo(): if not url.empty_response: # Index lookup - # NOTE: We probably don't want to leave it there. - indexing = Indexing() - freq = indexing.body_hash_fequency(url.body_hash) + freq = self.indexing.body_hash_fequency(url.body_hash) if freq['hash_freq'] > 1: to_append['body_hash_details'] = freq captures_list: List[Tuple[str, str, str, str]] = [] - for capture_uuid, url_uuid, url_hostname in indexing.get_body_hash_captures(url.body_hash, url.name): - cache = self.get_capture_cache(capture_uuid) + for capture_uuid, url_uuid, url_hostname in self.indexing.get_body_hash_captures(url.body_hash, url.name): + cache = self.capture_cache(capture_uuid) if cache: captures_list.append((capture_uuid, url_uuid, cache['title'], url_hostname)) diff --git a/website/web/__init__.py b/website/web/__init__.py index 89dc21f3..28ac6017 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -373,6 +373,7 @@ def tree(tree_uuid: str, urlnode_uuid: Optional[str]=None): user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid, meta=meta, enable_mail_notification=enable_mail_notification, urlnode_uuid=urlnode_uuid) + except NoValidHarFile as e: return render_template('error.html', error_message=e)