From 67b41ca8fbf0dc5452a27f3413a5cac042852dd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Mon, 15 Jun 2020 16:12:23 +0200
Subject: [PATCH] chg: Improve integration of cookies indexing

---
 bin/rebuild_caches.py |  6 +++-
 lookyloo/indexing.py  | 68 -----------------------------------------
 lookyloo/lookyloo.py  | 71 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 69 deletions(-)
 delete mode 100644 lookyloo/indexing.py

diff --git a/bin/rebuild_caches.py b/bin/rebuild_caches.py
index 5f8ecf07..db5df912 100755
--- a/bin/rebuild_caches.py
+++ b/bin/rebuild_caches.py
@@ -4,7 +4,7 @@
 import argparse
 import logging
 
-from lookyloo.lookyloo import Lookyloo
+from lookyloo.lookyloo import Lookyloo, Indexing
 
 logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                     level=logging.INFO, datefmt='%I:%M:%S')
@@ -19,3 +19,7 @@ if __name__ == '__main__':
         lookyloo.rebuild_all()
     else:
         lookyloo.rebuild_cache()
+
+    indexing = Indexing()
+    indexing.clear_indexes()
+    indexing.index_all()
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
deleted file mode 100644
index 289b158e..00000000
--- a/lookyloo/indexing.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-from typing import Set, Tuple, List, Optional, Dict, Any
-
-from redis import Redis
-
-from .helpers import get_socket_path
-from .lookyloo import Lookyloo
-
-
-class Indexing():
-
-    def __init__(self) -> None:
-        self.lookyloo = Lookyloo()
-        self.redis: Redis = Redis(unix_socket_path=get_socket_path('indexing'), decode_responses=True)
-
-    @property
-    def cookies_names(self) -> List[Tuple[str, float]]:
-        return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)
-
-    def cookies_names_number_domains(self, cookie_name: str) -> int:
-        return self.redis.zcard(f'cn|{cookie_name}')
-
-    def cookies_names_domains_values(self, cookie_name: str, domain: str) -> List[Tuple[str, float]]:
-        return self.redis.zrevrange(f'cn|{cookie_name}|{domain}', 0, -1, withscores=True)
-
-    def get_cookie_domains(self, cookie_name: str) -> List[Tuple[str, float]]:
-        return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True)
-
-    def get_capture_cache(self, capture_uuid: str) -> Optional[Dict[str, Any]]:
-        capture_dir = self.lookyloo.lookup_capture_dir(capture_uuid)
-        if capture_dir:
-            return self.lookyloo.capture_cache(capture_dir)
-        return {}
-
-    def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
-        return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
-
-    def index_cookies(self) -> None:
-        for capture_dir in self.lookyloo.capture_dirs:
-            print(f'Processing {capture_dir}')
-            try:
-                crawled_tree = self.lookyloo.get_crawled_tree(capture_dir)
-            except Exception as e:
-                print(e)
-                continue
-            pipeline = self.redis.pipeline()
-            already_loaded: Set[Tuple[str, str]] = set()
-            for urlnode in crawled_tree.root_hartree.url_tree.traverse():
-                if hasattr(urlnode, 'cookies_received'):
-                    for domain, cookie, _ in urlnode.cookies_received:
-                        name, value = cookie.split('=', 1)
-                        if (name, domain) in already_loaded:
-                            # Only add cookie name once / capture
-                            continue
-                        already_loaded.add((name, domain))
-                        pipeline.zincrby('cookies_names', 1, name)
-                        pipeline.zincrby(f'cn|{name}', 1, domain)
-                        pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
-                        pipeline.zincrby(f'cn|{name}|{domain}', 1, value)
-
-                        pipeline.sadd('lookyloo_domains', domain)
-                        pipeline.sadd(domain, name)
-
-                        # pipeline.zincrby('lookyloo_cookies_index_values', 1, value)
-                        # pipeline.zincrby(value, 1, name)
-            pipeline.execute()
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index f8098615..e40c35a2 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -32,6 +32,77 @@ from .helpers import get_homedir, get_socket_path, load_cookies, load_configs, s
 from .modules import VirusTotal, SaneJavaScript, PhishingInitiative
 
 
+class Indexing():
+
+    def __init__(self) -> None:
+        self.lookyloo = Lookyloo()
+        self.redis: Redis = Redis(unix_socket_path=get_socket_path('indexing'), decode_responses=True)
+
+    @property
+    def cookies_names(self) -> List[Tuple[str, float]]:
+        return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)
+
+    def cookies_names_number_domains(self, cookie_name: str) -> int:
+        return self.redis.zcard(f'cn|{cookie_name}')
+
+    def cookies_names_domains_values(self, cookie_name: str, domain: str) -> List[Tuple[str, float]]:
+        return self.redis.zrevrange(f'cn|{cookie_name}|{domain}', 0, -1, withscores=True)
+
+    def get_cookie_domains(self, cookie_name: str) -> List[Tuple[str, float]]:
+        return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True)
+
+    def get_capture_cache(self, capture_uuid: str) -> Optional[Dict[str, Any]]:
+        capture_dir = self.lookyloo.lookup_capture_dir(capture_uuid)
+        if capture_dir:
+            return self.lookyloo.capture_cache(capture_dir)
+        return {}
+
+    def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
+        return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
+
+    def clear_indexes(self):
+        self.redis.flushdb()
+
+    def index_all(self):
+        self.index_cookies()
+
+    def index_cookies_capture(self, capture_dir: Path) -> None:
+        print(f'Processing {capture_dir}')
+        try:
+            crawled_tree = self.lookyloo.get_crawled_tree(capture_dir)
+        except Exception as e:
+            print(e)
+            return
+
+        if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
+            # Do not reindex
+            return
+        self.redis.sadd('indexed_cookies', crawled_tree.uuid)
+
+        pipeline = self.redis.pipeline()
+        already_loaded: Set[Tuple[str, str]] = set()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if hasattr(urlnode, 'cookies_received'):
+                for domain, cookie, _ in urlnode.cookies_received:
+                    name, value = cookie.split('=', 1)
+                    if (name, domain) in already_loaded:
+                        # Only add cookie name once / capture
+                        continue
+                    already_loaded.add((name, domain))
+                    pipeline.zincrby('cookies_names', 1, name)
+                    pipeline.zincrby(f'cn|{name}', 1, domain)
+                    pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
+                    pipeline.zincrby(f'cn|{name}|{domain}', 1, value)
+
+                    pipeline.sadd('lookyloo_domains', domain)
+                    pipeline.sadd(domain, name)
+        pipeline.execute()
+
+    def index_cookies(self) -> None:
+        for capture_dir in self.lookyloo.capture_dirs:
+            self.index_cookies_capture(capture_dir)
+
+
 class Lookyloo():
 
     def __init__(self) -> None:
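
For context, the index this patch builds lives in a few Redis structures:
'cookies_names' is a sorted set counting how often each cookie name was
seen; 'cn|{name}' ranks the domains that set that name; 'cn|{name}|{domain}'
ranks the values seen for that (name, domain) pair; and
'cn|{name}|captures' is a set of 'capture_uuid|urlnode_uuid' strings.
Below is a minimal sketch (not part of the patch) of how the Indexing API
now exposed by lookyloo.py could be queried, assuming a configured Lookyloo
instance with the 'indexing' Redis socket running and at least one capture
already indexed:

    #!/usr/bin/env python3
    # Sketch: walk the cookie index built by Indexing.index_cookies().
    # Assumes a working Lookyloo setup; all methods used here are defined
    # in the patch above.
    from lookyloo.lookyloo import Indexing

    indexing = Indexing()

    # The cookies_names property is backed by ZREVRANGE ... WITHSCORES,
    # so it yields (cookie_name, count) pairs, most frequent first.
    for name, count in indexing.cookies_names:
        n_domains = indexing.cookies_names_number_domains(name)
        print(f'{name}: seen {int(count)} time(s) across {n_domains} domain(s)')
        # Each member of cn|{name}|captures is 'capture_uuid|urlnode_uuid',
        # already split on '|' by get_cookies_names_captures().
        for capture_uuid, urlnode_uuid in indexing.get_cookies_names_captures(name):
            print(f'  in capture {capture_uuid} (url node {urlnode_uuid})')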