lookyloo/lookyloo/indexing.py

#!/usr/bin/env python3

import hashlib
import logging
import re
from collections import defaultdict
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urlsplit

from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection

from .default import get_socket_path, get_config
from .helpers import get_public_suffix_list


class Indexing():

    def __init__(self) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                         path=get_socket_path('indexing'), decode_responses=True)

    def clear_indexes(self):
        self.redis.flushdb()

    @property
    def redis(self):
        return Redis(connection_pool=self.redis_pool)

    def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:
        # only trigger this method if the capture was already indexed.
        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
            self.logger.info(f'Cookies index: update internal UUIDs for {crawled_tree.uuid}')
            self._reindex_cookies_capture(crawled_tree)
        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
            self.logger.info(f'Body hashes index: update internal UUIDs for {crawled_tree.uuid}')
            self._reindex_body_hashes_capture(crawled_tree)

    # ###### Cookies ######

    @property
    def cookies_names(self) -> List[Tuple[str, float]]:
        return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)

    def cookies_names_number_domains(self, cookie_name: str) -> int:
        return self.redis.zcard(f'cn|{cookie_name}')

    def cookies_names_domains_values(self, cookie_name: str, domain: str) -> List[Tuple[str, float]]:
        return self.redis.zrevrange(f'cn|{cookie_name}|{domain}', 0, -1, withscores=True)

    def get_cookie_domains(self, cookie_name: str) -> List[Tuple[str, float]]:
        return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True)

    def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
        return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]

    def _reindex_cookies_capture(self, crawled_tree: CrawledTree) -> None:
        pipeline = self.redis.pipeline()
        already_loaded: Set[Tuple[str, str]] = set()
        already_cleaned_up = []
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if hasattr(urlnode, 'cookies_received'):
                for domain, cookie, _ in urlnode.cookies_received:
                    name, value = cookie.split('=', 1)
                    if (name, domain) in already_loaded:
                        # Only add cookie name once / capture
                        continue
                    already_loaded.add((name, domain))
                    if name not in already_cleaned_up:
                        for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*'):
                            pipeline.srem(f'cn|{name}|captures', key)
                    already_cleaned_up.append(name)
                    pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
        pipeline.execute()

    def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_cookies', crawled_tree.uuid)

        pipeline = self.redis.pipeline()
        already_loaded: Set[Tuple[str, str]] = set()
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if hasattr(urlnode, 'cookies_received'):
                for domain, cookie, _ in urlnode.cookies_received:
                    name, value = cookie.split('=', 1)
                    if (name, domain) in already_loaded:
                        # Only add cookie name once / capture
                        continue
                    already_loaded.add((name, domain))
                    pipeline.zincrby('cookies_names', 1, name)
                    pipeline.zincrby(f'cn|{name}', 1, domain)
                    pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
                    pipeline.zincrby(f'cn|{name}|{domain}', 1, value)

                    pipeline.sadd('lookyloo_domains', domain)
                    pipeline.sadd(domain, name)
        pipeline.execute()

    def aggregate_domain_cookies(self):
        psl = get_public_suffix_list()
        pipeline = self.redis.pipeline()
        for cn, cn_freq in self.cookies_names:
            for domain, d_freq in self.get_cookie_domains(cn):
                tld = psl.get_tld(domain)
                main_domain_part = re.sub(f'.{tld}$', '', domain).split('.')[-1]
                pipeline.zincrby('aggregate_domains_cn', cn_freq, f'{main_domain_part}|{cn}')
                pipeline.zincrby('aggregate_cn_domains', d_freq, f'{cn}|{main_domain_part}')
        pipeline.execute()
        aggregate_domains_cn: List[Tuple[str, float]] = self.redis.zrevrange('aggregate_domains_cn', 0, -1, withscores=True)
        aggregate_cn_domains: List[Tuple[str, float]] = self.redis.zrevrange('aggregate_cn_domains', 0, -1, withscores=True)
        self.redis.delete('aggregate_domains_cn')
        self.redis.delete('aggregate_cn_domains')
        return {'domains': aggregate_domains_cn, 'cookies': aggregate_cn_domains}

    # ###### Body hashes ######

    @property
    def ressources(self) -> List[Tuple[str, float]]:
        return self.redis.zrevrange('body_hashes', 0, 200, withscores=True)

    def ressources_number_domains(self, h: str) -> int:
        return self.redis.zcard(f'bh|{h}')

    def body_hash_fequency(self, body_hash: str) -> Dict[str, int]:
        pipeline = self.redis.pipeline()
        pipeline.zscore('body_hashes', body_hash)
        pipeline.zcard(f'bh|{body_hash}')
        hash_freq, hash_domains_freq = pipeline.execute()
        to_return = {'hash_freq': 0, 'hash_domains_freq': 0}
        if hash_freq:
            to_return['hash_freq'] = int(hash_freq)
        if hash_domains_freq:
            to_return['hash_domains_freq'] = int(hash_domains_freq)
        return to_return

    def _reindex_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
        # if the capture is regenerated, the hostnodes/urlnodes UUIDs are changed
        cleaned_up_hashes = []
        pipeline = self.redis.pipeline()
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            for h in urlnode.resources_hashes:
                if h not in cleaned_up_hashes:
                    self.redis.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
                cleaned_up_hashes.append(h)
                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
        pipeline.execute()

    def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)

        pipeline = self.redis.pipeline()
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            for h in urlnode.resources_hashes:
                pipeline.zincrby('body_hashes', 1, h)
                pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname)
                # set of all captures with this hash
                pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
                # ZSet of all urlnode_UUIDs|full_url
                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
        pipeline.execute()

    def get_hash_uuids(self, body_hash: str) -> Tuple[str, str, str]:
        capture_uuid: str = self.redis.srandmember(f'bh|{body_hash}|captures')
        entry = self.redis.zrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, 1)[0]
        urlnode_uuid, hostnode_uuid, url = entry.split('|', 2)
        return capture_uuid, urlnode_uuid, hostnode_uuid

    def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None,
                               filter_capture_uuid: Optional[str]=None,
                               limit: int=20,
                               prefered_uuids: Set[str]=set()) -> Tuple[int, List[Tuple[str, str, str, bool]]]:
        '''Get the captures matching the hash.
        :param filter_url: URL of the hash we're searching for
        :param filter_capture_uuid: UUID of the capture the hash was found in
        :param limit: Max matching captures to return
        :param prefered_uuids: UUID cached right now, so we don't rebuild trees.
        '''
        to_return: List[Tuple[str, str, str, bool]] = []
        all_captures: Set[str] = self.redis.smembers(f'bh|{body_hash}|captures')
        len_captures = len(all_captures)
        for capture_uuid in list(all_captures)[:limit]:
            if capture_uuid == filter_capture_uuid:
                # Used to skip hits in current capture
                len_captures -= 1
                continue
            if prefered_uuids and capture_uuid not in prefered_uuids:
                continue
            for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
                url_uuid, hostnode_uuid, url = entry.split('|', 2)
                hostname: str = urlsplit(url).hostname
                if filter_url:
                    to_return.append((capture_uuid, hostnode_uuid, hostname, url == filter_url))
                else:
                    to_return.append((capture_uuid, hostnode_uuid, hostname, False))
        return len_captures, to_return

    def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]:
        return self.redis.zrevrange(f'bh|{body_hash}', 0, -1, withscores=True)

    def get_body_hash_urls(self, body_hash: str) -> Dict[str, List[Dict[str, str]]]:
        all_captures: Set[str] = self.redis.smembers(f'bh|{body_hash}|captures')
        urls = defaultdict(list)
        for capture_uuid in list(all_captures):
            for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
                url_uuid, hostnode_uuid, url = entry.split('|', 2)
                urls[url].append({'capture': capture_uuid, 'hostnode': hostnode_uuid, 'urlnode': url_uuid})
        return urls

    # ###### URLs and Domains ######

    @property
    def urls(self) -> List[Tuple[str, float]]:
        return self.redis.zrevrange('urls', 0, 200, withscores=True)

    @property
    def hostnames(self) -> List[Tuple[str, float]]:
        return self.redis.zrevrange('hostnames', 0, 200, withscores=True)

    def index_url_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_urls', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_urls', crawled_tree.uuid)
        pipeline = self.redis.pipeline()
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if not urlnode.hostname or not urlnode.name:
                continue
            pipeline.zincrby('hostnames', 1, urlnode.hostname)
            pipeline.sadd(f'hostnames|{urlnode.hostname}|captures', crawled_tree.uuid)
            pipeline.zincrby('urls', 1, urlnode.name)
            # set of all captures with this URL
            # We need to make sure the keys in redis aren't too long.
            m = hashlib.md5()
            m.update(urlnode.name.encode())
            pipeline.sadd(f'urls|{m.hexdigest()}|captures', crawled_tree.uuid)
        pipeline.execute()

    def get_captures_url(self, url: str) -> Set[str]:
        m = hashlib.md5()
        m.update(url.encode())
        return self.redis.smembers(f'urls|{m.hexdigest()}|captures')

    def get_captures_hostname(self, hostname: str) -> Set[str]:
        return self.redis.smembers(f'hostnames|{hostname}|captures')

    # ###### Categories ######

    @property
    def categories(self) -> List[Tuple[str, int]]:
        return [(c, int(score))
                for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]

    def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]):
        if not categories:
            return
        if self.redis.sismember('indexed_categories', capture_uuid):
            # do not reindex
            return
        self.redis.sadd('indexed_categories', capture_uuid)
        if not categories:
            return
        pipeline = self.redis.pipeline()
        for category in categories:
            pipeline.zincrby('categories', 1, category)
            pipeline.sadd(category, capture_uuid)
        pipeline.execute()

    def get_captures_category(self, category: str) -> Set[str]:
        return self.redis.smembers(category)
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`#!/usr/bin/env python3`

new: API to query URLs 2020-10-27 00:02:18 +01:00			`import hashlib`
new: Add logging in Indexing 2021-09-24 16:27:46 +02:00			`import logging`
chg: Add body hash and domains in MISP lookup 2021-06-04 03:40:06 +02:00			`import re`
chg: reorder imports 2021-09-07 12:59:31 +02:00			`from collections import defaultdict`
			`from typing import Dict, Iterable, List, Optional, Set, Tuple`
			`from urllib.parse import urlsplit`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00
			`from har2tree import CrawledTree`
chg: reorder imports 2021-09-07 12:59:31 +02:00			`from redis import ConnectionPool, Redis`
			`from redis.connection import UnixDomainSocketConnection`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00
chg: use template 2021-10-18 13:06:43 +02:00			`from .default import get_socket_path, get_config`
			`from .helpers import get_public_suffix_list`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00

			`class Indexing():`

			`def __init__(self) -> None:`
new: Add logging in Indexing 2021-09-24 16:27:46 +02:00			`self.logger = logging.getLogger(f'{self.__class__.__name__}')`
			`self.logger.setLevel(get_config('generic', 'loglevel'))`
chg: Use connection pool in indexing 2021-08-18 17:31:17 +02:00			`self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,`
			`path=get_socket_path('indexing'), decode_responses=True)`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00
			`def clear_indexes(self):`
			`self.redis.flushdb()`

chg: Use connection pool in indexing 2021-08-18 17:31:17 +02:00			`@property`
			`def redis(self):`
			`return Redis(connection_pool=self.redis_pool)`

fix: Indexes not updated on tree rebuild, better handling of tree cache 2021-09-24 16:16:41 +02:00			`def new_internal_uuids(self, crawled_tree: CrawledTree) -> None:`
			`# only trigger this method if the capture was already indexed.`
			`if self.redis.sismember('indexed_cookies', crawled_tree.uuid):`
new: Add logging in Indexing 2021-09-24 16:27:46 +02:00			`self.logger.info(f'Cookies index: update internal UUIDs for {crawled_tree.uuid}')`
fix: Indexes not updated on tree rebuild, better handling of tree cache 2021-09-24 16:16:41 +02:00			`self._reindex_cookies_capture(crawled_tree)`
			`if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):`
new: Add logging in Indexing 2021-09-24 16:27:46 +02:00			`self.logger.info(f'Body hashes index: update internal UUIDs for {crawled_tree.uuid}')`
fix: Indexes not updated on tree rebuild, better handling of tree cache 2021-09-24 16:16:41 +02:00			`self._reindex_body_hashes_capture(crawled_tree)`

chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`# ###### Cookies ######`

			`@property`
			`def cookies_names(self) -> List[Tuple[str, float]]:`
			`return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)`

			`def cookies_names_number_domains(self, cookie_name: str) -> int:`
			`return self.redis.zcard(f'cn\|{cookie_name}')`

			`def cookies_names_domains_values(self, cookie_name: str, domain: str) -> List[Tuple[str, float]]:`
			`return self.redis.zrevrange(f'cn\|{cookie_name}\|{domain}', 0, -1, withscores=True)`

			`def get_cookie_domains(self, cookie_name: str) -> List[Tuple[str, float]]:`
			`return self.redis.zrevrange(f'cn\|{cookie_name}', 0, -1, withscores=True)`

			`def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:`
chg: Bump mypy 2021-06-09 21:04:26 +02:00			`return [uuids.split('\|') for uuids in self.redis.smembers(f'cn\|{cookie_name}\|captures')]`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00
fix: Indexes not updated on tree rebuild, better handling of tree cache 2021-09-24 16:16:41 +02:00			`def _reindex_cookies_capture(self, crawled_tree: CrawledTree) -> None:`
			`pipeline = self.redis.pipeline()`
			`already_loaded: Set[Tuple[str, str]] = set()`
			`already_cleaned_up = []`
			`for urlnode in crawled_tree.root_hartree.url_tree.traverse():`
			`if hasattr(urlnode, 'cookies_received'):`
			`for domain, cookie, _ in urlnode.cookies_received:`
			`name, value = cookie.split('=', 1)`
			`if (name, domain) in already_loaded:`
			`# Only add cookie name once / capture`
			`continue`
			`already_loaded.add((name, domain))`
			`if name not in already_cleaned_up:`
			`for key in self.redis.sscan_iter(f'cn\|{name}\|captures', f'{crawled_tree.uuid}\|*'):`
			`pipeline.srem(f'cn\|{name}\|captures', key)`
			`already_cleaned_up.append(name)`
			`pipeline.sadd(f'cn\|{name}\|captures', f'{crawled_tree.uuid}\|{urlnode.uuid}')`
			`pipeline.execute()`

chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:`
			`if self.redis.sismember('indexed_cookies', crawled_tree.uuid):`
			`# Do not reindex`
			`return`
			`self.redis.sadd('indexed_cookies', crawled_tree.uuid)`

			`pipeline = self.redis.pipeline()`
			`already_loaded: Set[Tuple[str, str]] = set()`
			`for urlnode in crawled_tree.root_hartree.url_tree.traverse():`
			`if hasattr(urlnode, 'cookies_received'):`
			`for domain, cookie, _ in urlnode.cookies_received:`
			`name, value = cookie.split('=', 1)`
			`if (name, domain) in already_loaded:`
			`# Only add cookie name once / capture`
			`continue`
			`already_loaded.add((name, domain))`
			`pipeline.zincrby('cookies_names', 1, name)`
			`pipeline.zincrby(f'cn\|{name}', 1, domain)`
			`pipeline.sadd(f'cn\|{name}\|captures', f'{crawled_tree.uuid}\|{urlnode.uuid}')`
			`pipeline.zincrby(f'cn\|{name}\|{domain}', 1, value)`

			`pipeline.sadd('lookyloo_domains', domain)`
			`pipeline.sadd(domain, name)`
			`pipeline.execute()`

			`def aggregate_domain_cookies(self):`
			`psl = get_public_suffix_list()`
			`pipeline = self.redis.pipeline()`
			`for cn, cn_freq in self.cookies_names:`
			`for domain, d_freq in self.get_cookie_domains(cn):`
			`tld = psl.get_tld(domain)`
chg: Add body hash and domains in MISP lookup 2021-06-04 03:40:06 +02:00			`main_domain_part = re.sub(f'.{tld}$', '', domain).split('.')[-1]`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`pipeline.zincrby('aggregate_domains_cn', cn_freq, f'{main_domain_part}\|{cn}')`
			`pipeline.zincrby('aggregate_cn_domains', d_freq, f'{cn}\|{main_domain_part}')`
			`pipeline.execute()`
chg: Bump mypy 2021-06-09 21:04:26 +02:00			`aggregate_domains_cn: List[Tuple[str, float]] = self.redis.zrevrange('aggregate_domains_cn', 0, -1, withscores=True)`
			`aggregate_cn_domains: List[Tuple[str, float]] = self.redis.zrevrange('aggregate_cn_domains', 0, -1, withscores=True)`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`self.redis.delete('aggregate_domains_cn')`
			`self.redis.delete('aggregate_cn_domains')`
			`return {'domains': aggregate_domains_cn, 'cookies': aggregate_cn_domains}`

			`# ###### Body hashes ######`

			`@property`
			`def ressources(self) -> List[Tuple[str, float]]:`
			`return self.redis.zrevrange('body_hashes', 0, 200, withscores=True)`

			`def ressources_number_domains(self, h: str) -> int:`
			`return self.redis.zcard(f'bh\|{h}')`

fix: No exception if body_hash is not indexed 2020-11-06 15:59:13 +01:00			`def body_hash_fequency(self, body_hash: str) -> Dict[str, int]:`
			`pipeline = self.redis.pipeline()`
			`pipeline.zscore('body_hashes', body_hash)`
			`pipeline.zcard(f'bh\|{body_hash}')`
			`hash_freq, hash_domains_freq = pipeline.execute()`
			`to_return = {'hash_freq': 0, 'hash_domains_freq': 0}`
			`if hash_freq:`
			`to_return['hash_freq'] = int(hash_freq)`
			`if hash_domains_freq:`
			`to_return['hash_domains_freq'] = int(hash_domains_freq)`
			`return to_return`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00
fix: Indexes not updated on tree rebuild, better handling of tree cache 2021-09-24 16:16:41 +02:00			`def _reindex_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:`
			`# if the capture is regenerated, the hostnodes/urlnodes UUIDs are changed`
			`cleaned_up_hashes = []`
			`pipeline = self.redis.pipeline()`
			`for urlnode in crawled_tree.root_hartree.url_tree.traverse():`
			`for h in urlnode.resources_hashes:`
			`if h not in cleaned_up_hashes:`
			`self.redis.delete(f'bh\|{h}\|captures\|{crawled_tree.uuid}')`
			`cleaned_up_hashes.append(h)`
			`pipeline.zincrby(f'bh\|{h}\|captures\|{crawled_tree.uuid}', 1,`
			`f'{urlnode.uuid}\|{urlnode.hostnode_uuid}\|{urlnode.name}')`
			`pipeline.execute()`

chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:`
			`if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):`
			`# Do not reindex`
			`return`
			`self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)`

			`pipeline = self.redis.pipeline()`
			`for urlnode in crawled_tree.root_hartree.url_tree.traverse():`
			`for h in urlnode.resources_hashes:`
			`pipeline.zincrby('body_hashes', 1, h)`
			`pipeline.zincrby(f'bh\|{h}', 1, urlnode.hostname)`
			`# set of all captures with this hash`
			`pipeline.sadd(f'bh\|{h}\|captures', crawled_tree.uuid)`
			`# ZSet of all urlnode_UUIDs\|full_url`
new: Hash lookup method 2020-10-23 20:51:15 +02:00			`pipeline.zincrby(f'bh\|{h}\|captures\|{crawled_tree.uuid}', 1,`
			`f'{urlnode.uuid}\|{urlnode.hostnode_uuid}\|{urlnode.name}')`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`pipeline.execute()`

			`def get_hash_uuids(self, body_hash: str) -> Tuple[str, str, str]:`
chg: Use connection pool whenever possible 2021-08-18 18:01:04 +02:00			`capture_uuid: str = self.redis.srandmember(f'bh\|{body_hash}\|captures')`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`entry = self.redis.zrange(f'bh\|{body_hash}\|captures\|{capture_uuid}', 0, 1)[0]`
			`urlnode_uuid, hostnode_uuid, url = entry.split('\|', 2)`
			`return capture_uuid, urlnode_uuid, hostnode_uuid`

			`def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None,`
			`filter_capture_uuid: Optional[str]=None,`
fix: Speedup display of hostnode popup 2022-07-27 14:36:56 +02:00			`limit: int=20,`
			`prefered_uuids: Set[str]=set()) -> Tuple[int, List[Tuple[str, str, str, bool]]]:`
			`'''Get the captures matching the hash.`
			`:param filter_url: URL of the hash we're searching for`
			`:param filter_capture_uuid: UUID of the capture the hash was found in`
			`:param limit: Max matching captures to return`
			`:param prefered_uuids: UUID cached right now, so we don't rebuild trees.`
			`'''`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`to_return: List[Tuple[str, str, str, bool]] = []`
chg: Bump mypy 2021-06-09 21:04:26 +02:00			`all_captures: Set[str] = self.redis.smembers(f'bh\|{body_hash}\|captures')`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`len_captures = len(all_captures)`
			`for capture_uuid in list(all_captures)[:limit]:`
			`if capture_uuid == filter_capture_uuid:`
			`# Used to skip hits in current capture`
			`len_captures -= 1`
			`continue`
fix: Speedup display of hostnode popup 2022-07-27 14:36:56 +02:00			`if prefered_uuids and capture_uuid not in prefered_uuids:`
			`continue`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`for entry in self.redis.zrevrange(f'bh\|{body_hash}\|captures\|{capture_uuid}', 0, -1):`
			`url_uuid, hostnode_uuid, url = entry.split('\|', 2)`
chg: Make new version of mypy happy 2020-10-12 12:15:07 +02:00			`hostname: str = urlsplit(url).hostname`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`if filter_url:`
chg: Make new version of mypy happy 2020-10-12 12:15:07 +02:00			`to_return.append((capture_uuid, hostnode_uuid, hostname, url == filter_url))`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`else:`
chg: Make new version of mypy happy 2020-10-12 12:15:07 +02:00			`to_return.append((capture_uuid, hostnode_uuid, hostname, False))`
chg: Refactoring, add get_hashes 2020-10-09 18:05:04 +02:00			`return len_captures, to_return`

			`def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]:`
			`return self.redis.zrevrange(f'bh\|{body_hash}', 0, -1, withscores=True)`
new: Hash lookup method 2020-10-23 20:51:15 +02:00
			`def get_body_hash_urls(self, body_hash: str) -> Dict[str, List[Dict[str, str]]]:`
chg: Bump mypy 2021-06-09 21:04:26 +02:00			`all_captures: Set[str] = self.redis.smembers(f'bh\|{body_hash}\|captures')`
new: Hash lookup method 2020-10-23 20:51:15 +02:00			`urls = defaultdict(list)`
			`for capture_uuid in list(all_captures):`
			`for entry in self.redis.zrevrange(f'bh\|{body_hash}\|captures\|{capture_uuid}', 0, -1):`
			`url_uuid, hostnode_uuid, url = entry.split('\|', 2)`
			`urls[url].append({'capture': capture_uuid, 'hostnode': hostnode_uuid, 'urlnode': url_uuid})`
			`return urls`
new: API to query URLs 2020-10-27 00:02:18 +01:00
			`# ###### URLs and Domains ######`

			`@property`
			`def urls(self) -> List[Tuple[str, float]]:`
			`return self.redis.zrevrange('urls', 0, 200, withscores=True)`

			`@property`
			`def hostnames(self) -> List[Tuple[str, float]]:`
			`return self.redis.zrevrange('hostnames', 0, 200, withscores=True)`

			`def index_url_capture(self, crawled_tree: CrawledTree) -> None:`
			`if self.redis.sismember('indexed_urls', crawled_tree.uuid):`
fix: No exception if body_hash is not indexed 2020-11-06 15:59:13 +01:00			`# Do not reindex`
new: API to query URLs 2020-10-27 00:02:18 +01:00			`return`
fix: Avoid indexing URLs multiple times 2021-03-12 13:18:36 +01:00			`self.redis.sadd('indexed_urls', crawled_tree.uuid)`
new: API to query URLs 2020-10-27 00:02:18 +01:00			`pipeline = self.redis.pipeline()`
			`for urlnode in crawled_tree.root_hartree.url_tree.traverse():`
			`if not urlnode.hostname or not urlnode.name:`
			`continue`
			`pipeline.zincrby('hostnames', 1, urlnode.hostname)`
			`pipeline.sadd(f'hostnames\|{urlnode.hostname}\|captures', crawled_tree.uuid)`
			`pipeline.zincrby('urls', 1, urlnode.name)`
			`# set of all captures with this URL`
			`# We need to make sure the keys in redis aren't too long.`
			`m = hashlib.md5()`
			`m.update(urlnode.name.encode())`
			`pipeline.sadd(f'urls\|{m.hexdigest()}\|captures', crawled_tree.uuid)`
			`pipeline.execute()`

			`def get_captures_url(self, url: str) -> Set[str]:`
			`m = hashlib.md5()`
			`m.update(url.encode())`
chg: Bump mypy 2021-06-09 21:04:26 +02:00			`return self.redis.smembers(f'urls\|{m.hexdigest()}\|captures')`
new: API to query URLs 2020-10-27 00:02:18 +01:00
			`def get_captures_hostname(self, hostname: str) -> Set[str]:`
chg: Bump mypy 2021-06-09 21:04:26 +02:00			`return self.redis.smembers(f'hostnames\|{hostname}\|captures')`
new: Integrate categorization in indexing 2020-11-09 16:02:54 +01:00
			`# ###### Categories ######`

			`@property`
			`def categories(self) -> List[Tuple[str, int]]:`
chg: Use connection pool whenever possible 2021-08-18 18:01:04 +02:00			`return [(c, int(score))`
new: Integrate categorization in indexing 2020-11-09 16:02:54 +01:00			`for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]`

			`def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]):`
			`if not categories:`
			`return`
			`if self.redis.sismember('indexed_categories', capture_uuid):`
			`# do not reindex`
			`return`
			`self.redis.sadd('indexed_categories', capture_uuid)`
			`if not categories:`
			`return`
			`pipeline = self.redis.pipeline()`
			`for category in categories:`
			`pipeline.zincrby('categories', 1, category)`
			`pipeline.sadd(category, capture_uuid)`
			`pipeline.execute()`

			`def get_captures_category(self, category: str) -> Set[str]:`
chg: Bump mypy 2021-06-09 21:04:26 +02:00			`return self.redis.smembers(category)`