#!/usr/bin/env python3
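"""Redis-backed indexes for Lookyloo captures.

Builds and queries inverted indexes (cookies, body hashes, HTTP header
hashes, URLs and hostnames, favicons, identifiers, and categories) so
that captures sharing an artefact can be found quickly.
"""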

from __future__ import annotations

import base64
import hashlib
import logging

# import re

from collections import defaultdict
from io import BytesIO
from typing import Iterable
from urllib.parse import urlsplit
from zipfile import ZipFile

import mmh3

from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection

from .default import get_socket_path, get_config
# from .helpers import get_public_suffix_list


class Indexing:
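    """Build and query the Redis indexes shared across captures: cookies,
    body hashes, HTTP header hashes, URLs and hostnames, favicons,
    identifiers, and categories."""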

    def __init__(self, full_index: bool = False) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        self.__redis_pool_bytes: ConnectionPool
        self.__redis_pool: ConnectionPool
        if full_index:
            self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                     path=get_socket_path('full_index'))
            self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                               path=get_socket_path('full_index'), decode_responses=True)
        else:
            self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                     path=get_socket_path('indexing'))
            self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                               path=get_socket_path('indexing'), decode_responses=True)

    def clear_indexes(self) -> None:
        self.redis.flushdb()

    @property
    def redis_bytes(self) -> Redis:  # type: ignore[type-arg]
        return Redis(connection_pool=self.__redis_pool_bytes)

    @property
    def redis(self) -> Redis:  # type: ignore[type-arg]
        return Redis(connection_pool=self.__redis_pool)

    @property
    def can_index(self) -> bool:
        # Grab a lock (expiring after one hour) so only one process indexes at a time.
        return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))

    def indexing_done(self) -> None:
        self.redis.delete('ongoing_indexing')

    def force_reindex(self, capture_uuid: str) -> None:
        p = self.redis.pipeline()
        p.srem('indexed_urls', capture_uuid)
        p.srem('indexed_body_hashes', capture_uuid)
        p.srem('indexed_cookies', capture_uuid)
        p.srem('indexed_hhhashes', capture_uuid)
        p.srem('indexed_favicons', capture_uuid)
        p.srem('indexed_identifiers', capture_uuid)
        p.execute()

    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool]:
        p = self.redis.pipeline()
        p.sismember('indexed_urls', capture_uuid)
        p.sismember('indexed_body_hashes', capture_uuid)
        p.sismember('indexed_cookies', capture_uuid)
        p.sismember('indexed_hhhashes', capture_uuid)
        p.sismember('indexed_favicons', capture_uuid)
        p.sismember('indexed_identifiers', capture_uuid)
        # The pipeline always returns exactly six booleans, one per sismember call.
        return p.execute()  # type: ignore[return-value]

    # ###### Cookies ######
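    # Index layout (keys as used by the methods below):
    #   cookies_names (zset)         cookie name -> occurrence count
    #   cn|{name} (zset)             domain -> occurrence count
    #   cn|{name}|{domain} (zset)    cookie value -> occurrence count
    #   cn|{name}|captures (set)     '{capture_uuid}|{urlnode_uuid}'
    #   {domain} (set)               cookie names seen on that domain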

    @property
    def cookies_names(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)

    def cookies_names_number_domains(self, cookie_name: str) -> int:
        return self.redis.zcard(f'cn|{cookie_name}')

    def cookies_names_domains_values(self, cookie_name: str, domain: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'cn|{cookie_name}|{domain}', 0, -1, withscores=True)

    def get_cookie_domains(self, cookie_name: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True)

    def get_cookies_names_captures(self, cookie_name: str) -> list[tuple[str, str]]:
        return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]

    def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
            # Do not reindex
            return
        self.logger.debug(f'Indexing cookies for {crawled_tree.uuid} ... ')
        self.redis.sadd('indexed_cookies', crawled_tree.uuid)

        pipeline = self.redis.pipeline()
        already_loaded: set[tuple[str, str]] = set()
        # Used if we need to reindex a capture
        already_cleaned_up: set[str] = set()
        is_reindex = False
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if 'cookies_received' not in urlnode.features:
                continue
            for domain, cookie, _ in urlnode.cookies_received:
                name, value = cookie.split('=', 1)
                if (name, domain) in already_loaded:
                    # Only add a (name, domain) pair once per capture
                    continue
                already_loaded.add((name, domain))
                if name not in already_cleaned_up:
                    # We only run this srem once per name for a capture,
                    # before adding it for the first time
                    to_remove = [key for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*')]
                    if to_remove:
                        pipeline.srem(f'cn|{name}|captures', *to_remove)
                        is_reindex = True
                        self.logger.debug(f'Reindexing cookies for {crawled_tree.uuid} ... ')
                    already_cleaned_up.add(name)
                pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
                if not is_reindex:
                    pipeline.zincrby('cookies_names', 1, name)
                    pipeline.zincrby(f'cn|{name}', 1, domain)
                    pipeline.zincrby(f'cn|{name}|{domain}', 1, value)
                    pipeline.sadd(domain, name)
        pipeline.execute()
        self.logger.debug(f'done with cookies for {crawled_tree.uuid}.')

    # ###### Body hashes ######
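    # Index layout:
    #   body_hashes (zset)                      body hash -> occurrence count
    #   bh|{h} (zset)                           hostname -> occurrence count
    #   bh|{h}|captures (set)                   capture UUIDs
    #   bh|{h}|captures|{capture_uuid} (zset)   '{urlnode_uuid}|{hostnode_uuid}|{url}'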

    @property
    def ressources(self) -> list[tuple[str, float]]:
        # Only the most common resources (ranks 0-200) are returned.
        return self.redis.zrevrange('body_hashes', 0, 200, withscores=True)

    def ressources_number_domains(self, h: str) -> int:
        return self.redis.zcard(f'bh|{h}')

    def body_hash_fequency(self, body_hash: str) -> dict[str, int]:
        pipeline = self.redis.pipeline()
        pipeline.zscore('body_hashes', body_hash)
        pipeline.zcard(f'bh|{body_hash}')
        hash_freq, hash_domains_freq = pipeline.execute()
        to_return = {'hash_freq': 0, 'hash_domains_freq': 0}
        if hash_freq:
            to_return['hash_freq'] = int(hash_freq)
        if hash_domains_freq:
            to_return['hash_domains_freq'] = int(hash_domains_freq)
        return to_return

    def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)
        self.logger.debug(f'Indexing body hashes for {crawled_tree.uuid} ... ')

        cleaned_up_hashes: set[str] = set()
        pipeline = self.redis.pipeline()
        is_reindex = False
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            for h in urlnode.resources_hashes:
                if h not in cleaned_up_hashes:
                    # Delete the hash for that capture the first time we see it.
                    if self.redis.exists(f'bh|{h}|captures|{crawled_tree.uuid}'):
                        pipeline.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
                        cleaned_up_hashes.add(h)
                        is_reindex = True
                        self.logger.debug(f'Reindexing body hashes for {crawled_tree.uuid} ... ')
                # ZSet of all urlnode_UUIDs|full_url
                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
                if not is_reindex:
                    pipeline.zincrby('body_hashes', 1, h)
                    pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname)
                    # Set of all captures with this hash
                    pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
        pipeline.execute()
        self.logger.debug(f'done with body hashes for {crawled_tree.uuid}.')

    def get_hash_uuids(self, body_hash: str) -> tuple[str, str, str]:
        """Use this to get a reference for fetching the resource from one of the captures."""
        capture_uuid = str(self.redis.srandmember(f'bh|{body_hash}|captures'))
        entry = self.redis.zrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, 1)[0]
        urlnode_uuid, hostnode_uuid, url = entry.split('|', 2)
        return capture_uuid, urlnode_uuid, hostnode_uuid

    def get_body_hash_captures(self, body_hash: str, filter_url: str | None = None,
                               filter_capture_uuid: str | None = None,
                               limit: int = 20,
                               prefered_uuids: set[str] | None = None) -> tuple[int, list[tuple[str, str, str, bool, str]]]:
        '''Get the captures matching the hash.

        :param filter_url: URL of the hash we're searching for
        :param filter_capture_uuid: UUID of the capture the hash was found in
        :param limit: Max matching captures to return, -1 means unlimited.
        :param prefered_uuids: UUIDs of captures with a cached tree, so we don't have to rebuild trees.
        '''
        to_return: list[tuple[str, str, str, bool, str]] = []
        len_captures = self.redis.scard(f'bh|{body_hash}|captures')
        unlimited = limit == -1
        for capture_uuid in self.redis.sscan_iter(f'bh|{body_hash}|captures'):
            if capture_uuid == filter_capture_uuid:
                # Used to skip hits in the current capture
                len_captures -= 1
                continue
            if prefered_uuids and capture_uuid not in prefered_uuids:
                continue
            if not unlimited:
                limit -= 1
            for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
                url_uuid, hostnode_uuid, url = entry.split('|', 2)
                hostname: str = urlsplit(url).hostname
                if filter_url:
                    to_return.append((capture_uuid, hostnode_uuid, hostname, url == filter_url, url))
                else:
                    to_return.append((capture_uuid, hostnode_uuid, hostname, False, url))
            if not unlimited and limit <= 0:
                break
        return len_captures, to_return

    def get_body_hash_domains(self, body_hash: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'bh|{body_hash}', 0, -1, withscores=True)

    def get_body_hash_urls(self, body_hash: str) -> dict[str, list[dict[str, str]]]:
        all_captures: set[str] = self.redis.smembers(f'bh|{body_hash}|captures')
        urls = defaultdict(list)
        for capture_uuid in list(all_captures):
            for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
                url_uuid, hostnode_uuid, url = entry.split('|', 2)
                urls[url].append({'capture': capture_uuid, 'hostnode': hostnode_uuid, 'urlnode': url_uuid})
        return urls

    # ###### HTTP Headers Hashes ######
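    # Index layout:
    #   hhhashes (zset)                 HTTP header hash -> occurrence count
    #   hhhashes|{hhh}|captures (set)   '{capture_uuid}|{urlnode_uuid}'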

    @property
    def http_headers_hashes(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('hhhashes', 0, -1, withscores=True)

    def http_headers_hashes_number_captures(self, hhh: str) -> int:
        return self.redis.scard(f'hhhashes|{hhh}|captures')

    def get_http_headers_hashes_captures(self, hhh: str) -> list[tuple[str, str]]:
        return [uuids.split('|') for uuids in self.redis.smembers(f'hhhashes|{hhh}|captures')]

    def index_http_headers_hashes_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_hhhashes', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_hhhashes', crawled_tree.uuid)
        self.logger.debug(f'Indexing http headers hashes for {crawled_tree.uuid} ... ')

        pipeline = self.redis.pipeline()
        already_loaded: set[str] = set()
        already_cleaned_up: set[str] = set()
        is_reindex = False
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if 'hhhash' not in urlnode.features:
                continue
            if urlnode.hhhash in already_loaded:
                # Only add the hhhash once per capture
                continue
            already_loaded.add(urlnode.hhhash)
            if urlnode.hhhash not in already_cleaned_up:
                # We only run this srem once per hhhash for a capture,
                # before adding it for the first time
                to_remove = [key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|*')]
                if to_remove:
                    pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures', *to_remove)
                    is_reindex = True
                    self.logger.debug(f'Reindexing http headers hashes for {crawled_tree.uuid} ... ')
                already_cleaned_up.add(urlnode.hhhash)
            pipeline.sadd(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
            if not is_reindex:
                pipeline.zincrby('hhhashes', 1, urlnode.hhhash)
        pipeline.execute()
        self.logger.debug(f'done with http headers hashes for {crawled_tree.uuid}.')

    # ###### URLs and Domains ######
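    # Index layout:
    #   urls (zset)                           URL -> occurrence count
    #   hostnames (zset)                      hostname -> occurrence count
    #   hostnames|{hostname}|captures (set)   capture UUIDs
    #   urls|{md5(url)}|captures (set)        capture UUIDs (MD5 keeps the key size bounded)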

    @property
    def urls(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('urls', 0, 200, withscores=True)

    @property
    def hostnames(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('hostnames', 0, 200, withscores=True)

    def index_url_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_urls', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_urls', crawled_tree.uuid)
        self.logger.debug(f'Indexing URLs for {crawled_tree.uuid} ... ')
        pipeline = self.redis.pipeline()
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if not urlnode.hostname or not urlnode.name:
                continue
            if not self.redis.sismember(f'hostnames|{urlnode.hostname}|captures', crawled_tree.uuid):
                pipeline.zincrby('hostnames', 1, urlnode.hostname)
                pipeline.zincrby('urls', 1, urlnode.name)
                pipeline.sadd(f'hostnames|{urlnode.hostname}|captures', crawled_tree.uuid)
                # Set of all captures with this URL
                # We need to make sure the keys in redis aren't too long.
                md5 = hashlib.md5(urlnode.name.encode()).hexdigest()
                pipeline.sadd(f'urls|{md5}|captures', crawled_tree.uuid)
        pipeline.execute()
        self.logger.debug(f'done with URLs for {crawled_tree.uuid}.')

    def get_captures_url(self, url: str) -> set[str]:
        md5 = hashlib.md5(url.encode()).hexdigest()
        return self.redis.smembers(f'urls|{md5}|captures')

    def get_captures_hostname(self, hostname: str) -> set[str]:
        return self.redis.smembers(f'hostnames|{hostname}|captures')

    # ###### favicons ######
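    # Index layout:
    #   favicons (zset)                    favicon SHA512 -> occurrence count
    #   favicons|{sha512}|captures (set)   capture UUIDs
    #   favicons|{sha512} (string)         the favicon itself (raw bytes)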

    @property
    def favicons(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('favicons', 0, 200, withscores=True)

    def favicon_frequency(self, favicon_sha512: str) -> float | None:
        return self.redis.zscore('favicons', favicon_sha512)

    def favicon_number_captures(self, favicon_sha512: str) -> int:
        return self.redis.scard(f'favicons|{favicon_sha512}|captures')

    def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
        if self.redis.sismember('indexed_favicons', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_favicons', capture_uuid)
        self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
        pipeline = self.redis.pipeline()
        with ZipFile(favicons, 'r') as myzip:
            for name in myzip.namelist():
                if not name.endswith('.ico'):
                    continue
                favicon = myzip.read(name)
                if not favicon:
                    # Empty file, ignore.
                    continue
                sha = hashlib.sha512(favicon).hexdigest()
                if not self.redis.sismember(f'favicons|{sha}|captures', capture_uuid):
                    # Do not count the same favicon more than once for the same capture
                    pipeline.zincrby('favicons', 1, sha)
                    pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
                    # There is no easy access to the favicons unless we store them in redis
                    pipeline.set(f'favicons|{sha}', favicon)
        pipeline.execute()

    def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
        return self.redis.smembers(f'favicons|{favicon_sha512}|captures')

    def get_favicon(self, favicon_sha512: str) -> bytes | None:
        # Favicons are raw bytes, so we need the non-decoding connection pool.
        return self.redis_bytes.get(f'favicons|{favicon_sha512}')

    # ###### identifiers ######
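    # Index layout:
    #   identifiers_types (set)                          all identifier types
    #   identifiers|{type} (zset)                        identifier -> occurrence count
    #   identifiers|{type}|{identifier}|captures (set)   capture UUIDs
    #   identifiers|{capture_uuid} (set)                 identifier types in the capture
    #   identifiers|{capture_uuid}|{type} (set)          identifiers of that type in the capture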

    def identifiers_types(self) -> set[str]:
        return self.redis.smembers('identifiers_types')

    def identifiers(self, identifier_type: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'identifiers|{identifier_type}', 0, 200, withscores=True)

    def identifier_frequency(self, identifier_type: str, identifier: str) -> float | None:
        return self.redis.zscore(f'identifiers|{identifier_type}', identifier)

    def identifier_number_captures(self, identifier_type: str, identifier: str) -> int:
        return self.redis.scard(f'identifiers|{identifier_type}|{identifier}|captures')

    def index_identifiers_capture(self, crawled_tree: CrawledTree) -> None:
        capture_uuid = crawled_tree.uuid
        if self.redis.sismember('indexed_identifiers', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_identifiers', capture_uuid)
        if (not hasattr(crawled_tree.root_hartree.rendered_node, 'identifiers')
                or not crawled_tree.root_hartree.rendered_node.identifiers):
            return
        pipeline = self.redis.pipeline()
        # We have multiple identifier types; this is the difference with the other indexes
        for identifier_type, id_values in crawled_tree.root_hartree.rendered_node.identifiers.items():
            pipeline.sadd('identifiers_types', identifier_type)  # no-op if already there
            if self.redis.sismember(f'indexed_identifiers|{identifier_type}|captures', capture_uuid):
                # Do not reindex the same identifier type for the same capture
                continue
            pipeline.sadd(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
            self.logger.debug(f'Indexing identifiers {identifier_type} for {capture_uuid} ... ')
            for identifier in id_values:
                if self.redis.sismember(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid):
                    # Already counted this specific identifier for this capture
                    continue
                pipeline.sadd(f'identifiers|{capture_uuid}', identifier_type)
                pipeline.sadd(f'identifiers|{capture_uuid}|{identifier_type}', identifier)
                pipeline.sadd(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid)
                pipeline.zincrby(f'identifiers|{identifier_type}', 1, identifier)
        pipeline.execute()

    def get_identifiers_capture(self, capture_uuid: str) -> dict[str, set[str]]:
        to_return = {}
        for identifier_type in self.redis.smembers(f'identifiers|{capture_uuid}'):
            to_return[identifier_type] = self.redis.smembers(f'identifiers|{capture_uuid}|{identifier_type}')
        return to_return

    def get_captures_identifier(self, identifier_type: str, identifier: str) -> set[str]:
        return self.redis.smembers(f'identifiers|{identifier_type}|{identifier}|captures')

    # ###### favicons probabilistic hashes ######
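    # Index layout:
    #   favicons|{algorithm} (zset)                   probabilistic hash -> occurrence count
    #   favicons|{algorithm}|{phash}|captures (set)   capture UUIDs
    #   favicons|{algorithm}|{phash}|favicons (set)   favicon SHA512s with this probabilistic hash
    #   favicons|{algorithm}|{sha512} (set)           probabilistic hashes of this favicon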

    def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:
        return self.redis.zscore(f'favicons|{algorithm}', phash)

    def index_favicons_probabilistic(self, capture_uuid: str, favicons: BytesIO, algorithm: str) -> None:
        # FIXME: this method isn't used anymore
        if self.redis.sismember(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid)
        pipeline = self.redis.pipeline()
        with ZipFile(favicons, 'r') as myzip:
            for name in myzip.namelist():
                if not name.endswith('.ico'):
                    continue
                favicon = myzip.read(name)
                if not favicon:
                    # Empty file, ignore.
                    continue
                sha = hashlib.sha512(favicon).hexdigest()
                if algorithm == 'mmh3-shodan':
                    # Shodan uses a weird technique:
                    # 1. encode the image to base64, with newlines every 76 characters (as per RFC 2045)
                    # 2. hash the base64 string with mmh3
                    b64 = base64.encodebytes(favicon)
                    h = str(mmh3.hash(b64))
                else:
                    raise NotImplementedError(f'Unknown algorithm: {algorithm}')
                pipeline.zincrby(f'favicons|{algorithm}', 1, h)
                # All captures with this hash for this algorithm
                pipeline.sadd(f'favicons|{algorithm}|{h}|captures', capture_uuid)
                # All favicons (by SHA512) with this hash for this algorithm
                pipeline.sadd(f'favicons|{algorithm}|{h}|favicons', sha)
                # Reverse lookup to get probabilistic hashes related to a specific favicon
                pipeline.sadd(f'favicons|{algorithm}|{sha}', h)
        pipeline.execute()
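
    # A minimal standalone sketch of the 'mmh3-shodan' computation above, for a
    # favicon already loaded as `data: bytes` (hypothetical variable):
    #
    #   import base64, mmh3
    #   shodan_hash = mmh3.hash(base64.encodebytes(data))  # signed 32-bit int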

    def get_hashes_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
        '''All the favicon SHA512s for this probabilistic hash for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{phash}|favicons')

    def get_probabilistic_hashes_favicon(self, algorithm: str, favicon_sha512: str) -> set[str]:
        '''All the probabilistic hashes for this favicon SHA512 for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{favicon_sha512}')

    def get_captures_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
        '''All the captures with this probabilistic hash for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{phash}|captures')

    # ###### Categories ######
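    # Index layout:
    #   categories (zset)   category -> occurrence count
    #   {category} (set)    capture UUIDs in this category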

    @property
    def categories(self) -> list[tuple[str, int]]:
        return [(c, int(score))
                for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]

    def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]) -> None:
        if not categories:
            return
        if self.redis.sismember('indexed_categories', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_categories', capture_uuid)
        pipeline = self.redis.pipeline()
        for category in categories:
            pipeline.zincrby('categories', 1, category)
            pipeline.sadd(category, capture_uuid)
        pipeline.execute()

    def get_captures_category(self, category: str) -> set[str]:
        return self.redis.smembers(category)
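
# A minimal usage sketch (not part of the original module), assuming a running
# Lookyloo instance with the 'indexing' Redis socket available and an already
# built CrawledTree `ct`:
#
#   indexing = Indexing()
#   if indexing.can_index:
#       indexing.index_url_capture(ct)
#       indexing.index_cookies_capture(ct)
#       indexing.index_body_hashes_capture(ct)
#       indexing.indexing_done()
#   print(indexing.get_captures_hostname('example.com'))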