#!/usr/bin/env python3

from __future__ import annotations

import base64
import hashlib
import logging
# import re

from io import BytesIO
from collections import defaultdict
from typing import Iterable
from urllib.parse import urlsplit
from zipfile import ZipFile

import mmh3
from bs4 import BeautifulSoup
from hashlib import sha256
from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection

from .default import get_socket_path, get_config
# from .helpers import get_public_suffix_list


class Indexing():

    def __init__(self, full_index: bool=False) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        self.__redis_pool_bytes: ConnectionPool
        self.__redis_pool: ConnectionPool
        if full_index:
            self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                     path=get_socket_path('full_index'))
            self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                               path=get_socket_path('full_index'), decode_responses=True)
        else:
            self.__redis_pool_bytes = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                     path=get_socket_path('indexing'))
            self.__redis_pool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                               path=get_socket_path('indexing'), decode_responses=True)

    def clear_indexes(self) -> None:
        self.redis.flushdb()

    @property
    def redis_bytes(self) -> Redis:  # type: ignore[type-arg]
        return Redis(connection_pool=self.__redis_pool_bytes)

    @property
    def redis(self) -> Redis:  # type: ignore[type-arg]
        return Redis(connection_pool=self.__redis_pool)

    @property
    def can_index(self) -> bool:
        return bool(self.redis.set('ongoing_indexing', 1, ex=3600, nx=True))

    def indexing_done(self) -> None:
        self.redis.delete('ongoing_indexing')
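    # Minimal usage sketch (illustrative only, not called from this module): a
    # background indexer is expected to grab the lock before indexing and release
    # it once done, e.g.:
    #
    #   indexing = Indexing()
    #   if indexing.can_index:
    #       try:
    #           ...  # index the captures that need it
    #       finally:
    #           indexing.indexing_done()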

    def force_reindex(self, capture_uuid: str) -> None:
        p = self.redis.pipeline()
        p.srem('indexed_urls', capture_uuid)
        p.srem('indexed_body_hashes', capture_uuid)
        p.srem('indexed_cookies', capture_uuid)
        p.srem('indexed_hhhashes', capture_uuid)
        p.srem('indexed_favicons', capture_uuid)
        p.srem('indexed_identifiers', capture_uuid)
        for identifier_type in self.identifiers_types():
            p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
        for hash_type in self.captures_hashes_types():
            p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
        p.execute()

    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool]:
        p = self.redis.pipeline()
        p.sismember('indexed_urls', capture_uuid)
        p.sismember('indexed_body_hashes', capture_uuid)
        p.sismember('indexed_cookies', capture_uuid)
        p.sismember('indexed_hhhashes', capture_uuid)
        p.sismember('indexed_favicons', capture_uuid)
        p.sismember('indexed_identifiers', capture_uuid)
        # We also need to check if the hash types are all indexed for this capture
        hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid)
                                 for hash_type in self.captures_hashes_types())
        to_return: list[bool] = p.execute()
        to_return.append(hash_types_indexed)
        # This call returns a tuple of exactly 7 booleans
        return tuple(to_return)  # type: ignore[return-value]
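    # Note (illustrative, names are hypothetical): the booleans come back in the
    # order the checks are queued above, i.e.:
    #   urls, body_hashes, cookies, hhhashes, favicons, identifiers, hashes = indexing.capture_indexed(uuid)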

    # ###### Cookies ######

    @property
    def cookies_names(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)

    def cookies_names_number_domains(self, cookie_name: str) -> int:
        return self.redis.zcard(f'cn|{cookie_name}')

    def cookies_names_domains_values(self, cookie_name: str, domain: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'cn|{cookie_name}|{domain}', 0, -1, withscores=True)

    def get_cookie_domains(self, cookie_name: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True)

    def get_cookies_names_captures(self, cookie_name: str) -> list[tuple[str, str]]:
        return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]

    def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
            # Do not reindex
            return
        self.logger.debug(f'Indexing cookies for {crawled_tree.uuid} ... ')
        self.redis.sadd('indexed_cookies', crawled_tree.uuid)

        pipeline = self.redis.pipeline()
        already_loaded: set[tuple[str, str]] = set()
        # used if we need to reindex a capture
        already_cleaned_up: set[str] = set()
        is_reindex = False
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if 'cookies_received' not in urlnode.features:
                continue
            for domain, cookie, _ in urlnode.cookies_received:
                name, value = cookie.split('=', 1)
                if (name, domain) in already_loaded:
                    # Only add cookie name once / capture
                    continue
                already_loaded.add((name, domain))
                if name not in already_cleaned_up:
                    # We only run this srem once per name for a capture,
                    # before adding it for the first time
                    to_remove = [key for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*')]
                    if to_remove:
                        pipeline.srem(f'cn|{name}|captures', *to_remove)
                        is_reindex = True
                        self.logger.debug(f'reindexing cookies for {crawled_tree.uuid} ... ')
                    already_cleaned_up.add(name)
                pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
                if not is_reindex:
                    pipeline.zincrby('cookies_names', 1, name)
                    pipeline.zincrby(f'cn|{name}', 1, domain)
                    pipeline.zincrby(f'cn|{name}|{domain}', 1, value)
                    pipeline.sadd(domain, name)
        pipeline.execute()
        self.logger.debug(f'done with cookies for {crawled_tree.uuid}.')
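    # Redis layout used above (descriptive note, derived from the code):
    #   cookies_names            -> zset of cookie names, scored by frequency
    #   cn|{name}                -> zset of domains that set this cookie name
    #   cn|{name}|{domain}       -> zset of values seen for this name on this domain
    #   cn|{name}|captures       -> set of '{capture_uuid}|{urlnode_uuid}' entries
    #   {domain}                 -> set of cookie names set by this domain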

    # ###### Body hashes ######

    @property
    def ressources(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('body_hashes', 0, 200, withscores=True)

    def ressources_number_domains(self, h: str) -> int:
        return self.redis.zcard(f'bh|{h}')

    def body_hash_fequency(self, body_hash: str) -> dict[str, int]:
        pipeline = self.redis.pipeline()
        pipeline.zscore('body_hashes', body_hash)
        pipeline.zcard(f'bh|{body_hash}')
        hash_freq, hash_domains_freq = pipeline.execute()
        to_return = {'hash_freq': 0, 'hash_domains_freq': 0}
        if hash_freq:
            to_return['hash_freq'] = int(hash_freq)
        if hash_domains_freq:
            to_return['hash_domains_freq'] = int(hash_domains_freq)
        return to_return

    def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)
        self.logger.debug(f'Indexing body hashes for {crawled_tree.uuid} ... ')

        cleaned_up_hashes: set[str] = set()
        pipeline = self.redis.pipeline()
        is_reindex = False
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            for h in urlnode.resources_hashes:
                if h not in cleaned_up_hashes:
                    # Delete the hash for that capture the first time we see it.
                    if self.redis.exists(f'bh|{h}|captures|{crawled_tree.uuid}'):
                        pipeline.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
                        cleaned_up_hashes.add(h)
                        is_reindex = True
                        self.logger.debug(f'reindexing body hashes for {crawled_tree.uuid} ... ')
                # ZSet of all urlnode_UUIDs|full_url
                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
                if not is_reindex:
                    pipeline.zincrby('body_hashes', 1, h)
                    pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname)
                    # set of all captures with this hash
                    pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
        pipeline.execute()
        self.logger.debug(f'done with body hashes for {crawled_tree.uuid}.')
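    # Redis layout used above (descriptive note, derived from the code):
    #   body_hashes                      -> zset of resource hashes, scored by frequency
    #   bh|{h}                           -> zset of hostnames serving this resource
    #   bh|{h}|captures                  -> set of capture UUIDs containing this resource
    #   bh|{h}|captures|{capture_uuid}   -> zset of '{urlnode_uuid}|{hostnode_uuid}|{url}' entries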

    def get_hash_uuids(self, body_hash: str) -> tuple[str, str, str]:
        """Use this to get a reference that allows fetching a resource from one of the captures."""
        capture_uuid = str(self.redis.srandmember(f'bh|{body_hash}|captures'))
        entry = self.redis.zrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, 1)[0]
        urlnode_uuid, hostnode_uuid, url = entry.split('|', 2)
        return capture_uuid, urlnode_uuid, hostnode_uuid

    def get_body_hash_captures(self, body_hash: str, filter_url: str | None=None,
                               filter_capture_uuid: str | None=None,
                               limit: int=20,
                               prefered_uuids: set[str]=set()) -> tuple[int, list[tuple[str, str, str, bool, str]]]:
        '''Get the captures matching the hash.

        :param filter_url: URL of the hash we're searching for
        :param filter_capture_uuid: UUID of the capture the hash was found in
        :param limit: Max matching captures to return, -1 means unlimited.
        :param prefered_uuids: UUIDs of the captures currently cached, so we don't rebuild trees.
        '''
        to_return: list[tuple[str, str, str, bool, str]] = []
        len_captures = self.redis.scard(f'bh|{body_hash}|captures')
        unlimited = False
        if limit == -1:
            unlimited = True
        for capture_uuid in self.redis.sscan_iter(f'bh|{body_hash}|captures'):
            if capture_uuid == filter_capture_uuid:
                # Used to skip hits in current capture
                len_captures -= 1
                continue
            if prefered_uuids and capture_uuid not in prefered_uuids:
                continue
            if not unlimited:
                limit -= 1
            for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
                url_uuid, hostnode_uuid, url = entry.split('|', 2)
                hostname: str = urlsplit(url).hostname
                if filter_url:
                    to_return.append((capture_uuid, hostnode_uuid, hostname, url == filter_url, url))
                else:
                    to_return.append((capture_uuid, hostnode_uuid, hostname, False, url))
            if not unlimited and limit <= 0:
                break
        return len_captures, to_return

    def get_body_hash_domains(self, body_hash: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'bh|{body_hash}', 0, -1, withscores=True)

    def get_body_hash_urls(self, body_hash: str) -> dict[str, list[dict[str, str]]]:
        all_captures: set[str] = self.redis.smembers(f'bh|{body_hash}|captures')
        urls = defaultdict(list)
        for capture_uuid in list(all_captures):
            for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
                url_uuid, hostnode_uuid, url = entry.split('|', 2)
                urls[url].append({'capture': capture_uuid, 'hostnode': hostnode_uuid, 'urlnode': url_uuid})
        return urls

    # ###### HTTP Headers Hashes ######

    @property
    def http_headers_hashes(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('hhhashes', 0, -1, withscores=True)

    def http_headers_hashes_number_captures(self, hhh: str) -> int:
        return self.redis.scard(f'hhhashes|{hhh}|captures')

    def get_http_headers_hashes_captures(self, hhh: str) -> list[tuple[str, str]]:
        return [uuids.split('|') for uuids in self.redis.smembers(f'hhhashes|{hhh}|captures')]

    def index_http_headers_hashes_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_hhhashes', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_hhhashes', crawled_tree.uuid)
        self.logger.debug(f'Indexing http headers hashes for {crawled_tree.uuid} ... ')

        pipeline = self.redis.pipeline()
        already_loaded: set[str] = set()
        already_cleaned_up: set[str] = set()
        is_reindex = False
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if 'hhhash' not in urlnode.features:
                continue
            if urlnode.hhhash in already_loaded:
                # Only add an HHHash once per capture
                continue
            already_loaded.add(urlnode.hhhash)
            if urlnode.hhhash not in already_cleaned_up:
                # We only run this srem once per HHHash for a capture,
                # before adding it for the first time
                to_remove = [key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|*')]
                if to_remove:
                    pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures', *to_remove)
                    is_reindex = True
                    self.logger.debug(f'reindexing http headers hashes for {crawled_tree.uuid} ... ')
                already_cleaned_up.add(urlnode.hhhash)
            pipeline.sadd(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
            if not is_reindex:
                pipeline.zincrby('hhhashes', 1, urlnode.hhhash)
        pipeline.execute()
        self.logger.debug(f'done with http headers hashes for {crawled_tree.uuid}.')
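    # Redis layout used above (descriptive note, derived from the code):
    #   hhhashes                  -> zset of HTTP headers hashes (HHHash), scored by frequency
    #   hhhashes|{hhh}|captures   -> set of '{capture_uuid}|{urlnode_uuid}' entries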

    # ###### URLs and Domains ######

    @property
    def urls(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('urls', 0, 200, withscores=True)

    @property
    def hostnames(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('hostnames', 0, 200, withscores=True)

    def index_url_capture(self, crawled_tree: CrawledTree) -> None:
        if self.redis.sismember('indexed_urls', crawled_tree.uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_urls', crawled_tree.uuid)
        self.logger.debug(f'Indexing URLs for {crawled_tree.uuid} ... ')
        pipeline = self.redis.pipeline()
        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
            if not urlnode.hostname or not urlnode.name:
                continue
            if not self.redis.sismember(f'hostnames|{urlnode.hostname}|captures', crawled_tree.uuid):
                pipeline.zincrby('hostnames', 1, urlnode.hostname)
                pipeline.zincrby('urls', 1, urlnode.name)
                pipeline.sadd(f'hostnames|{urlnode.hostname}|captures', crawled_tree.uuid)
                # set of all captures with this URL
                # We need to make sure the keys in redis aren't too long.
                md5 = hashlib.md5(urlnode.name.encode()).hexdigest()
                pipeline.sadd(f'urls|{md5}|captures', crawled_tree.uuid)
        pipeline.execute()
        self.logger.debug(f'done with URLs for {crawled_tree.uuid}.')
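    # Redis layout used above (descriptive note, derived from the code):
    #   urls                              -> zset of URLs, scored by frequency
    #   hostnames                         -> zset of hostnames, scored by frequency
    #   hostnames|{hostname}|captures     -> set of capture UUIDs
    #   urls|{md5(url)}|captures          -> set of capture UUIDs (the md5 keeps the key short)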

    def get_captures_url(self, url: str) -> set[str]:
        md5 = hashlib.md5(url.encode()).hexdigest()
        return self.redis.smembers(f'urls|{md5}|captures')

    def get_captures_hostname(self, hostname: str) -> set[str]:
        return self.redis.smembers(f'hostnames|{hostname}|captures')

    # ###### favicons ######

    @property
    def favicons(self) -> list[tuple[str, float]]:
        return self.redis.zrevrange('favicons', 0, 200, withscores=True)

    def favicon_frequency(self, favicon_sha512: str) -> float | None:
        return self.redis.zscore('favicons', favicon_sha512)

    def favicon_number_captures(self, favicon_sha512: str) -> int:
        return self.redis.scard(f'favicons|{favicon_sha512}|captures')

    def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
        if self.redis.sismember('indexed_favicons', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_favicons', capture_uuid)
        self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
        pipeline = self.redis.pipeline()
        with ZipFile(favicons, 'r') as myzip:
            for name in myzip.namelist():
                if not name.endswith('.ico'):
                    continue
                favicon = myzip.read(name)
                if not favicon:
                    # Empty file, ignore.
                    continue
                sha = hashlib.sha512(favicon).hexdigest()
                if not self.redis.sismember(f'favicons|{sha}|captures', capture_uuid):
                    # Do not count the same favicon more than once for the same capture
                    pipeline.zincrby('favicons', 1, sha)
                    pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
                    # There is no easy access to the favicons unless we store them in redis
                    pipeline.set(f'favicons|{sha}', favicon)
        pipeline.execute()

    def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
        return self.redis.smembers(f'favicons|{favicon_sha512}|captures')

    def get_favicon(self, favicon_sha512: str) -> bytes | None:
        return self.redis_bytes.get(f'favicons|{favicon_sha512}')
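    # Redis layout used above (descriptive note, derived from the code):
    #   favicons                      -> zset of favicon SHA512s, scored by frequency
    #   favicons|{sha512}|captures    -> set of capture UUIDs containing this favicon
    #   favicons|{sha512}             -> raw favicon bytes (fetched through redis_bytes)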

    # ###### Capture hashes ######

    # This is where we define the indexing for the hashes generated for a whole capture (at most one hash per capture)
    # certpl_html_structure_hash: concatenated list of all the tag names on the page - done on the rendered page

    def _compute_certpl_html_structure_hash(self, html: str) -> str:
        soup = BeautifulSoup(html, "lxml")
        to_hash = "|".join(t.name for t in soup.findAll()).encode()
        return sha256(to_hash).hexdigest()[:32]
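    # Illustration (assumed example, not an official test vector): for a rendered page
    # like '<html><head></head><body><div><p>hi</p></div></body></html>', the joined tag
    # names are 'html|head|body|div|p' and the stored value is the first 32 hex
    # characters of that string's SHA256.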

    def captures_hashes_types(self) -> set[str]:
        return {'certpl_html_structure_hash'}
        # return self.redis.smembers('capture_hash_types')

    def captures_hashes(self, hash_type: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'capture_hash_types|{hash_type}', 0, 200, withscores=True)

    def hash_frequency(self, hash_type: str, h: str) -> float | None:
        return self.redis.zscore(f'capture_hash_types|{hash_type}', h)

    def hash_number_captures(self, hash_type: str, h: str) -> int:
        return self.redis.scard(f'capture_hash_types|{hash_type}|{h}|captures')

    def index_capture_hashes_types(self, crawled_tree: CrawledTree) -> None:
        capture_uuid = crawled_tree.uuid
        # NOTE: We will have multiple hash types for each capture, we want to make sure
        # to reindex all the captures if there is a new hash type but only index the new
        # captures on the existing hash types
        # hashes = ('certpl_html_structure_hash', )
        for hash_type in self.captures_hashes_types():
            if self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid):
                # Do not reindex this hash type, but keep checking the other ones
                continue
            self.redis.sadd(f'indexed_hash_type|{hash_type}', capture_uuid)

            if hash_type == 'certpl_html_structure_hash':
                # we must have a rendered HTML for this hash to be relevant.
                if (not hasattr(crawled_tree.root_hartree.rendered_node, 'rendered_html')
                        or not crawled_tree.root_hartree.rendered_node.rendered_html):
                    continue
                # we have a rendered HTML, compute the hash
                hash_to_index = self._compute_certpl_html_structure_hash(crawled_tree.root_hartree.rendered_node.rendered_html)
            else:
                self.logger.warning(f'Unknown hash type: {hash_type}')
                continue

            if not hash_to_index:
                self.logger.info(f'No hash to index for {hash_type} in {capture_uuid} ... ')
                continue

            if self.redis.sismember(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid):
                # Already counted this specific hash for this capture
                continue
            self.logger.debug(f'Indexing hash {hash_type} for {capture_uuid} ... ')
            pipeline = self.redis.pipeline()
            pipeline.hset(f'capture_hash_types|{capture_uuid}', hash_type, hash_to_index)
            pipeline.sadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid)
            pipeline.zincrby(f'capture_hash_types|{hash_type}', 1, hash_to_index)
            pipeline.execute()

    def get_hashes_types_capture(self, capture_uuid: str) -> dict[str, str]:
        return self.redis.hgetall(f'capture_hash_types|{capture_uuid}')

    def get_captures_hash_type(self, hash_type: str, h: str) -> set[str]:
        return self.redis.smembers(f'capture_hash_types|{hash_type}|{h}|captures')
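    # Redis layout used above (descriptive note, derived from the code):
    #   indexed_hash_type|{hash_type}                     -> set of capture UUIDs indexed for this hash type
    #   capture_hash_types|{capture_uuid}                 -> hash mapping {hash_type: value} for this capture
    #   capture_hash_types|{hash_type}                    -> zset of values, scored by frequency
    #   capture_hash_types|{hash_type}|{value}|captures   -> set of capture UUIDs with this value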

    # ###### identifiers ######

    def identifiers_types(self) -> set[str]:
        return self.redis.smembers('identifiers_types')

    def identifiers(self, identifier_type: str) -> list[tuple[str, float]]:
        return self.redis.zrevrange(f'identifiers|{identifier_type}', 0, 200, withscores=True)

    def identifier_frequency(self, identifier_type: str, identifier: str) -> float | None:
        return self.redis.zscore(f'identifiers|{identifier_type}', identifier)

    def identifier_number_captures(self, identifier_type: str, identifier: str) -> int:
        return self.redis.scard(f'identifiers|{identifier_type}|{identifier}|captures')

    def index_identifiers_capture(self, crawled_tree: CrawledTree) -> None:
        capture_uuid = crawled_tree.uuid
        if self.redis.sismember('indexed_identifiers', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd('indexed_identifiers', capture_uuid)
        if (not hasattr(crawled_tree.root_hartree.rendered_node, 'identifiers')
                or not crawled_tree.root_hartree.rendered_node.identifiers):
            return
        pipeline = self.redis.pipeline()
        # We have multiple identifier types, this is the difference with the other indexes
        for identifier_type, id_values in crawled_tree.root_hartree.rendered_node.identifiers.items():
            pipeline.sadd('identifiers_types', identifier_type)  # no-op if already there
            if self.redis.sismember(f'indexed_identifiers|{identifier_type}|captures', capture_uuid):
                # Do not reindex the same identifier type for the same capture
                continue
            pipeline.sadd(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
            self.logger.debug(f'Indexing identifiers {identifier_type} for {capture_uuid} ... ')
            for identifier in id_values:
                if self.redis.sismember(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid):
                    # Already counted this specific identifier for this capture
                    continue
                pipeline.sadd(f'identifiers|{capture_uuid}', identifier_type)
                pipeline.sadd(f'identifiers|{capture_uuid}|{identifier_type}', identifier)
                pipeline.sadd(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid)
                pipeline.zincrby(f'identifiers|{identifier_type}', 1, identifier)
        pipeline.execute()

    def get_identifiers_capture(self, capture_uuid: str) -> dict[str, set[str]]:
        to_return = {}
        for identifier_type in self.redis.smembers(f'identifiers|{capture_uuid}'):
            to_return[identifier_type] = self.redis.smembers(f'identifiers|{capture_uuid}|{identifier_type}')
        return to_return

    def get_captures_identifier(self, identifier_type: str, identifier: str) -> set[str]:
        return self.redis.smembers(f'identifiers|{identifier_type}|{identifier}|captures')
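    # Redis layout used above (descriptive note, derived from the code):
    #   identifiers_types                          -> set of known identifier types
    #   identifiers|{type}                         -> zset of identifiers, scored by frequency
    #   identifiers|{capture_uuid}                 -> set of identifier types seen in this capture
    #   identifiers|{capture_uuid}|{type}          -> set of identifiers of this type in this capture
    #   identifiers|{type}|{identifier}|captures   -> set of capture UUIDs with this identifier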

    # ###### favicons probabilistic hashes ######

    def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:
        return self.redis.zscore(f'favicons|{algorithm}', phash)

    def index_favicons_probabilistic(self, capture_uuid: str, favicons: BytesIO, algorithm: str) -> None:
        # FIXME: this method isn't used anymore
        if self.redis.sismember(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid):
            # Do not reindex
            return
        self.redis.sadd(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid)
        pipeline = self.redis.pipeline()
        with ZipFile(favicons, 'r') as myzip:
            for name in myzip.namelist():
                if not name.endswith('.ico'):
                    continue
                favicon = myzip.read(name)
                if not favicon:
                    # Empty file, ignore.
                    continue
                sha = hashlib.sha512(favicon).hexdigest()
                if algorithm == 'mmh3-shodan':
                    # Shodan uses a weird technique:
                    # 1. encodes the image to base64, with newlines every 76 characters (as per RFC 2045)
                    # 2. hashes the base64 string with mmh3
                    b64 = base64.encodebytes(favicon)
                    h = str(mmh3.hash(b64))
                else:
                    raise NotImplementedError(f'Unknown algorithm: {algorithm}')
                pipeline.zincrby(f'favicons|{algorithm}', 1, h)
                # All captures with this hash for this algorithm
                pipeline.sadd(f'favicons|{algorithm}|{h}|captures', capture_uuid)
                # All favicons (SHA512) with this hash for this algorithm
                pipeline.sadd(f'favicons|{algorithm}|{h}|favicons', sha)
                # reverse lookup to get probabilistic hashes related to a specific favicon
                pipeline.sadd(f'favicons|{algorithm}|{sha}', h)
        pipeline.execute()

    def get_hashes_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
        '''All the favicon SHA512s for this probabilistic hash for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{phash}|favicons')

    def get_probabilistic_hashes_favicon(self, algorithm: str, favicon_sha512: str) -> set[str]:
        '''All the probabilistic hashes for this favicon SHA512 for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{favicon_sha512}')

    def get_captures_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
        '''All the captures with this probabilistic hash for this algorithm'''
        return self.redis.smembers(f'favicons|{algorithm}|{phash}|captures')
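    # Redis layout used above (descriptive note, derived from the code):
    #   indexed_favicons_probabilistic|{algorithm}   -> set of capture UUIDs indexed with this algorithm
    #   favicons|{algorithm}                         -> zset of probabilistic hashes, scored by frequency
    #   favicons|{algorithm}|{phash}|captures        -> set of capture UUIDs
    #   favicons|{algorithm}|{phash}|favicons        -> set of favicon SHA512s with this probabilistic hash
    #   favicons|{algorithm}|{sha512}                -> set of probabilistic hashes for this favicon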

    # ###### Categories ######

    @property
    def categories(self) -> list[tuple[str, int]]:
        return [(c, int(score))
                for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]

    def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]) -> None:
        if not categories:
            return
        if self.redis.sismember('indexed_categories', capture_uuid):
            # do not reindex
            return
        self.redis.sadd('indexed_categories', capture_uuid)
        pipeline = self.redis.pipeline()
        for category in categories:
            pipeline.zincrby('categories', 1, category)
            pipeline.sadd(category, capture_uuid)
        pipeline.execute()

    def get_captures_category(self, category: str) -> set[str]:
        return self.redis.smembers(category)
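    # Redis layout used above (descriptive note, derived from the code):
    #   categories    -> zset of category names, scored by frequency
    #   {category}    -> set of capture UUIDs with this category (the category name itself is the key)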