From e40796c92ecb909544678c368cb83feb476d4410 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Mon, 11 Mar 2024 00:14:07 +0100
Subject: [PATCH] chg: merge index and reindex methods.

---
 bin/run_backend.py       |   2 +-
 lookyloo/capturecache.py |   3 +-
 lookyloo/indexing.py     | 178 ++++++++++++++++----------------------
 3 files changed, 76 insertions(+), 107 deletions(-)

diff --git a/bin/run_backend.py b/bin/run_backend.py
index 4349b12..e86ecc1 100755
--- a/bin/run_backend.py
+++ b/bin/run_backend.py
@@ -66,7 +66,7 @@ def shutdown_full_index(storage_directory: Path | None=None) -> None:
     if not storage_directory:
         storage_directory = get_homedir()
     r = Redis(unix_socket_path=get_socket_path('full_index'))
-    r.shutdown(save=True)
+    r.shutdown()
     print('Kvrocks full indexing database shutdown.')


diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py
index e06330b..e83589f 100644
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -396,8 +396,9 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
                 logger.debug('Unable to rebuild the tree, the HAR files are broken.')
         except TreeNeedsRebuild:
             try:
+                logger.debug('The tree needs to be rebuilt.')
                 tree = self._create_pickle(capture_dir, logger)
-                self.indexing.new_internal_uuids(tree)
+                self.indexing.force_reindex(uuid)
             except NoValidHarFile:
                 logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
                 tree = None
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index 4d4656c..e335d12 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -58,6 +58,15 @@ class Indexing():
     def indexing_done(self) -> None:
         self.redis.delete('ongoing_indexing')

+    def force_reindex(self, capture_uuid: str) -> None:
+        p = self.redis.pipeline()
+        p.srem('indexed_urls', capture_uuid)
+        p.srem('indexed_body_hashes', capture_uuid)
+        p.srem('indexed_cookies', capture_uuid)
+        p.srem('indexed_hhhashes', capture_uuid)
+        p.srem('indexed_favicons', capture_uuid)
+        p.execute()
+
     def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool]:
         p = self.redis.pipeline()
         p.sismember('indexed_urls', capture_uuid)
@@ -68,18 +77,6 @@ class Indexing():
         # This call for sure returns a tuple of 5 booleans
         return p.execute()  # type: ignore[return-value]

-    def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:
-        # only trigger this method if the capture was already indexed.
-        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
-            self.logger.debug(f'Cookies index: update internal UUIDs for {crawled_tree.uuid}')
-            self._reindex_cookies_capture(crawled_tree)
-        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
-            self.logger.debug(f'Body hashes index: update internal UUIDs for {crawled_tree.uuid}')
-            self._reindex_body_hashes_capture(crawled_tree)
-        if self.redis.sismember('indexed_hhhashes', crawled_tree.uuid):
-            self.logger.debug(f'HTTP Headers hashes index: update internal UUIDs for {crawled_tree.uuid}')
-            self._reindex_http_headers_hashes_capture(crawled_tree)
-
     # ###### Cookies ######

     @property
@@ -98,10 +95,18 @@ class Indexing():
     def get_cookies_names_captures(self, cookie_name: str) -> list[tuple[str, str]]:
         return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]

-    def _reindex_cookies_capture(self, crawled_tree: CrawledTree) -> None:
+    def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
+        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
+            # Do not reindex
+            return
+        self.logger.debug(f'Indexing cookies for {crawled_tree.uuid} ... ')
+        self.redis.sadd('indexed_cookies', crawled_tree.uuid)
+
         pipeline = self.redis.pipeline()
         already_loaded: set[tuple[str, str]] = set()
+        # used if we need to reindex a capture
         already_cleaned_up: set[str] = set()
+        is_reindex = False
         for urlnode in crawled_tree.root_hartree.url_tree.traverse():
             if 'cookies_received' not in urlnode.features:
                 continue
@@ -117,52 +122,17 @@ class Indexing():
                     to_remove = [key for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*')]
                     if to_remove:
                         pipeline.srem(f'cn|{name}|captures', *to_remove)
+                        is_reindex = True
+                        self.logger.debug(f'reindexing cookies for {crawled_tree.uuid} ... ')
                     already_cleaned_up.add(name)
                 pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
+                if not is_reindex:
+                    pipeline.zincrby('cookies_names', 1, name)
+                    pipeline.zincrby(f'cn|{name}', 1, domain)
+                    pipeline.zincrby(f'cn|{name}|{domain}', 1, value)
+                    pipeline.sadd(domain, name)
         pipeline.execute()
-
-    def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
-        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
-            # Do not reindex
-            return
-        self.redis.sadd('indexed_cookies', crawled_tree.uuid)
-
-        pipeline = self.redis.pipeline()
-        already_loaded: set[tuple[str, str]] = set()
-        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
-            if 'cookies_received' not in urlnode.features:
-                continue
-            for domain, cookie, _ in urlnode.cookies_received:
-                name, value = cookie.split('=', 1)
-                if (name, domain) in already_loaded:
-                    # Only add cookie name once / capture
-                    continue
-                already_loaded.add((name, domain))
-                pipeline.zincrby('cookies_names', 1, name)
-                pipeline.zincrby(f'cn|{name}', 1, domain)
-                pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
-                pipeline.zincrby(f'cn|{name}|{domain}', 1, value)
-                pipeline.sadd(domain, name)
-        pipeline.execute()
-
-    """
-    # Not used anywhere?
-    def aggregate_domain_cookies(self):
-        psl = get_public_suffix_list()
-        pipeline = self.redis.pipeline()
-        for cn, cn_freq in self.cookies_names:
-            for domain, d_freq in self.get_cookie_domains(cn):
-                tld = psl.publicsuffix(domain)
-                main_domain_part = re.sub(f'.{tld}$', '', domain).split('.')[-1]
-                pipeline.zincrby('aggregate_domains_cn', cn_freq, f'{main_domain_part}|{cn}')
-                pipeline.zincrby('aggregate_cn_domains', d_freq, f'{cn}|{main_domain_part}')
-        pipeline.execute()
-        aggregate_domains_cn: List[Tuple[str, float]] = self.redis.zrevrange('aggregate_domains_cn', 0, -1, withscores=True)
-        aggregate_cn_domains: List[Tuple[str, float]] = self.redis.zrevrange('aggregate_cn_domains', 0, -1, withscores=True)
-        self.redis.delete('aggregate_domains_cn')
-        self.redis.delete('aggregate_cn_domains')
-        return {'domains': aggregate_domains_cn, 'cookies': aggregate_cn_domains}
-    """
+        self.logger.debug(f'done with cookies for {crawled_tree.uuid}.')

     # ###### Body hashes ######

@@ -185,37 +155,35 @@ class Indexing():
             to_return['hash_domains_freq'] = int(hash_domains_freq)
         return to_return

-    def _reindex_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
-        # if the capture is regenerated, the hostnodes/urlnodes UUIDs are changed
-        cleaned_up_hashes: set[str] = set()
-        pipeline = self.redis.pipeline()
-        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
-            for h in urlnode.resources_hashes:
-                if h not in cleaned_up_hashes:
-                    # Delete the hash for that capture the first time we see it.
-                    pipeline.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
-                    cleaned_up_hashes.add(h)
-                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
-                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
-        pipeline.execute()
-
     def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
             # Do not reindex
             return
         self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)
+        self.logger.debug(f'Indexing body hashes for {crawled_tree.uuid} ... ')

+        cleaned_up_hashes: set[str] = set()
         pipeline = self.redis.pipeline()
+        is_reindex = False
         for urlnode in crawled_tree.root_hartree.url_tree.traverse():
             for h in urlnode.resources_hashes:
-                pipeline.zincrby('body_hashes', 1, h)
-                pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname)
-                # set of all captures with this hash
-                pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
+                if h not in cleaned_up_hashes:
+                    # Delete the hash for that capture the first time we see it.
+                    if self.redis.exists(f'bh|{h}|captures|{crawled_tree.uuid}'):
+                        pipeline.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
+                        cleaned_up_hashes.add(h)
+                        is_reindex = True
+                        self.logger.debug(f'reindexing body hashes for {crawled_tree.uuid} ... ')
                 # ZSet of all urlnode_UUIDs|full_url
                 pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
                                  f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
+                if not is_reindex:
+                    pipeline.zincrby('body_hashes', 1, h)
+                    pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname)
+                    # set of all captures with this hash
+                    pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
         pipeline.execute()
+        self.logger.debug(f'done with body hashes for {crawled_tree.uuid}.')

     def get_hash_uuids(self, body_hash: str) -> tuple[str, str, str]:
         """Use that to get a reference allowing to fetch a resource from one of the capture."""
@@ -284,10 +252,17 @@ class Indexing():
     def get_http_headers_hashes_captures(self, hhh: str) -> list[tuple[str, str]]:
         return [uuids.split('|') for uuids in self.redis.smembers(f'hhhashes|{hhh}|captures')]

-    def _reindex_http_headers_hashes_capture(self, crawled_tree: CrawledTree) -> None:
+    def index_http_headers_hashes_capture(self, crawled_tree: CrawledTree) -> None:
+        if self.redis.sismember('indexed_hhhashes', crawled_tree.uuid):
+            # Do not reindex
+            return
+        self.redis.sadd('indexed_hhhashes', crawled_tree.uuid)
+        self.logger.debug(f'Indexing http headers hashes for {crawled_tree.uuid} ... ')
+
         pipeline = self.redis.pipeline()
         already_loaded: set[str] = set()
         already_cleaned_up: set[str] = set()
+        is_reindex = False
         for urlnode in crawled_tree.root_hartree.url_tree.traverse():
             if 'hhhash' not in urlnode.features:
                 continue
@@ -301,28 +276,14 @@ class Indexing():
                 to_remove = [key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|*')]
                 if to_remove:
                     pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures', * to_remove)
+                    is_reindex = True
+                    self.logger.debug(f'reindexing http headers hashes for {crawled_tree.uuid} ... ')
                 already_cleaned_up.add(urlnode.hhhash)
             pipeline.sadd(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
+            if not is_reindex:
+                pipeline.zincrby('hhhashes', 1, urlnode.hhhash)
         pipeline.execute()
-
-    def index_http_headers_hashes_capture(self, crawled_tree: CrawledTree) -> None:
-        if self.redis.sismember('indexed_hhhashes', crawled_tree.uuid):
-            # Do not reindex
-            return
-        self.redis.sadd('indexed_hhhashes', crawled_tree.uuid)
-
-        pipeline = self.redis.pipeline()
-        already_loaded: set[str] = set()
-        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
-            if 'hhhash' not in urlnode.features:
-                continue
-            if urlnode.hhhash in already_loaded:
-                # Only add cookie name once / capture
-                continue
-            already_loaded.add(urlnode.hhhash)
-            pipeline.zincrby('hhhashes', 1, urlnode.hhhash)
-            pipeline.sadd(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
-        pipeline.execute()
+        self.logger.debug(f'done with http headers hashes for {crawled_tree.uuid}.')

     # ###### URLs and Domains ######

@@ -339,18 +300,21 @@ class Indexing():
             # Do not reindex
             return
         self.redis.sadd('indexed_urls', crawled_tree.uuid)
+        self.logger.debug(f'Indexing URLs for {crawled_tree.uuid} ... ')
         pipeline = self.redis.pipeline()
         for urlnode in crawled_tree.root_hartree.url_tree.traverse():
             if not urlnode.hostname or not urlnode.name:
                 continue
-            pipeline.zincrby('hostnames', 1, urlnode.hostname)
-            pipeline.sadd(f'hostnames|{urlnode.hostname}|captures', crawled_tree.uuid)
-            pipeline.zincrby('urls', 1, urlnode.name)
-            # set of all captures with this URL
-            # We need to make sure the keys in redis aren't too long.
-            md5 = hashlib.md5(urlnode.name.encode()).hexdigest()
-            pipeline.sadd(f'urls|{md5}|captures', crawled_tree.uuid)
+            if not self.redis.sismember(f'hostnames|{urlnode.hostname}|captures', crawled_tree.uuid):
+                pipeline.zincrby('hostnames', 1, urlnode.hostname)
+                pipeline.zincrby('urls', 1, urlnode.name)
+                pipeline.sadd(f'hostnames|{urlnode.hostname}|captures', crawled_tree.uuid)
+                # set of all captures with this URL
+                # We need to make sure the keys in redis aren't too long.
+                md5 = hashlib.md5(urlnode.name.encode()).hexdigest()
+                pipeline.sadd(f'urls|{md5}|captures', crawled_tree.uuid)
         pipeline.execute()
+        self.logger.debug(f'done with URLs for {crawled_tree.uuid}.')

     def get_captures_url(self, url: str) -> set[str]:
         md5 = hashlib.md5(url.encode()).hexdigest()
@@ -376,6 +340,7 @@ class Indexing():
             # Do not reindex
             return
         self.redis.sadd('indexed_favicons', capture_uuid)
+        self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
         pipeline = self.redis.pipeline()
         with ZipFile(favicons, 'r') as myzip:
             for name in myzip.namelist():
@@ -386,10 +351,12 @@ class Indexing():
                     # Empty file, ignore.
                     continue
                 sha = hashlib.sha512(favicon).hexdigest()
-                pipeline.zincrby('favicons', 1, sha)
-                pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
-                # There is no easi access to the favicons unless we store them in redis
-                pipeline.set(f'favicons|{sha}', favicon)
+                if not self.redis.sismember(f'favicons|{sha}|captures', capture_uuid):
+                    # Do not count the same favicon more than once for the same capture
+                    pipeline.zincrby('favicons', 1, sha)
+                    pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
+                    # There is no easy access to the favicons unless we store them in redis
+                    pipeline.set(f'favicons|{sha}', favicon)
         pipeline.execute()

     def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
@@ -404,6 +371,7 @@ class Indexing():
         return self.redis.zscore(f'favicons|{algorithm}', phash)

     def index_favicons_probabilistic(self, capture_uuid: str, favicons: BytesIO, algorithm: str) -> None:
+        # FIXME: this method isn't used anymore
        if self.redis.sismember(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid):
             # Do not reindex
             return
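
Reviewer note: the sketch below distills the index-or-reindex pattern this patch
applies to each index (cookies, body hashes, HTTP header hashes, URLs, favicons).
It is an illustration only, not part of the patch; the key names ('indexed_demo',
'demo|...'), the function names, and the node UUID placeholder are hypothetical
stand-ins for the real Lookyloo keys and methods.

    from redis import Redis

    redis = Redis(decode_responses=True)

    def index_demo_capture(capture_uuid: str, names: list[str]) -> None:
        if redis.sismember('indexed_demo', capture_uuid):
            # Already indexed: nothing to do. force_reindex() drops this marker.
            return
        redis.sadd('indexed_demo', capture_uuid)

        pipeline = redis.pipeline()
        is_reindex = False
        for name in names:
            # Per-capture entries can be stale (internal node UUIDs change when
            # a tree is rebuilt), so drop them before re-adding.
            to_remove = [key for key in redis.sscan_iter(f'demo|{name}|captures', f'{capture_uuid}|*')]
            if to_remove:
                pipeline.srem(f'demo|{name}|captures', *to_remove)
                is_reindex = True
            pipeline.sadd(f'demo|{name}|captures', f'{capture_uuid}|node-uuid-placeholder')
            if not is_reindex:
                # Global frequency counters are only bumped on first indexing;
                # as in the patch, the flag is capture-wide, so once any stale
                # entry is found the counters are left untouched for the rest
                # of the capture.
                pipeline.zincrby('demo_names', 1, name)
        pipeline.execute()

    def force_reindex(capture_uuid: str) -> None:
        # Dropping the marker is enough: the next index_demo_capture() call
        # walks the capture again and cleans up the stale per-capture keys.
        redis.srem('indexed_demo', capture_uuid)

With this shape, capturecache.py only needs to call force_reindex(uuid) after
rebuilding a tree, replacing the removed new_internal_uuids() path.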