mirror of https://github.com/CIRCL/lookyloo
chg: cleanup in the main lookyloo class
parent 8433cbcc1b
commit 81390d5ea0
@@ -25,7 +25,7 @@ class Archiver(AbstractManager):
         self.redis = Redis(unix_socket_path=get_socket_path('cache'))

         # make sure archived captures dir exists
-        self.archived_captures_dir = get_homedir / 'archived_captures'
+        self.archived_captures_dir = get_homedir() / 'archived_captures'
         self.archived_captures_dir.mkdir(parents=True, exist_ok=True)

         self._load_archives()
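The only change in this hunk is calling get_homedir() instead of passing the function object itself: dividing a function by a string raises a TypeError, so the archived captures directory could never be created. A minimal sketch of the difference, with a stand-in helper (the real one lives in lookyloo.helpers):

    from pathlib import Path

    def get_homedir() -> Path:
        # stand-in for lookyloo.helpers.get_homedir(); the real path comes from the config
        return Path('/opt/lookyloo')

    # before: TypeError: unsupported operand type(s) for /: 'function' and 'str'
    # archived_captures_dir = get_homedir / 'archived_captures'

    # after: a proper Path that can be created
    archived_captures_dir = get_homedir() / 'archived_captures'
    archived_captures_dir.mkdir(parents=True, exist_ok=True)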
@@ -78,8 +78,8 @@ class Archiver(AbstractManager):

         if archived_uuids:
             p = self.redis.pipeline()
-            p.redis.hdel('lookup_dirs', *archived_uuids.keys())
-            p.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+            p.hdel('lookup_dirs', *archived_uuids.keys())
+            p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
             p.execute()
         self.logger.info('Archiving done.')
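Here the archiver stops calling the commands through p.redis (apparently a leftover; a redis-py pipeline object has no such attribute) and queues them on the pipeline itself, so the removal from lookup_dirs and the insertion into lookup_dirs_archived are sent together at execute(). A hedged sketch of the intended pattern, with placeholder data and an illustrative socket path:

    from redis import Redis

    r = Redis(unix_socket_path='/tmp/lookyloo_cache.sock')       # illustrative path
    archived_uuids = {'some-uuid': '/path/to/archived/capture'}  # placeholder data

    p = r.pipeline()
    p.hdel('lookup_dirs', *archived_uuids.keys())      # queued, not sent yet
    p.hmset('lookup_dirs_archived', archived_uuids)    # queued as well
    p.execute()                                        # both commands go out together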
@@ -91,8 +91,8 @@ class Archiver(AbstractManager):
                 if not (month / 'index').exists():
                     continue
                 with (month / 'index').open('r') as _f:
-                    archived_uuids = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
-                self.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+                    archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
+                self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore


 def main():
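Both archiver hunks also switch from hset(..., mapping=...) to hmset(...) with a type: ignore. The two spellings write the same hash fields; hmset is the older call (deprecated in recent redis-py releases in favour of hset with mapping=), so the switch presumably works around the redis-py version or type stubs pinned at the time. Equivalent writes, as a sketch:

    from redis import Redis

    r = Redis(unix_socket_path='/tmp/lookyloo_cache.sock')   # illustrative path
    archived_uuids = {'uuid-1': '/archive/2021/03/uuid-1'}   # placeholder mapping

    r.hmset('lookup_dirs_archived', archived_uuids)          # older API, still works
    r.hset('lookup_dirs_archived', mapping=archived_uuids)   # newer equivalent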
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import json
+import logging
+from collections import Counter
+from datetime import timedelta, date
+from typing import Dict, Any
+
+from redis import Redis
+from werkzeug.useragents import UserAgent
+
+from lookyloo.abstractmanager import AbstractManager
+from lookyloo.helpers import (get_config, get_homedir, get_socket_path,
+                              safe_create_dir)
+
+logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
+                    level=logging.INFO, datefmt='%I:%M:%S')
+
+
+class Processing(AbstractManager):
+
+    def __init__(self, loglevel: int=logging.INFO):
+        super().__init__(loglevel)
+        self.script_name = 'archiver'
+
+        self.use_own_ua = get_config('generic', 'use_user_agents_users')
+
+    def _to_run_forever(self):
+        if self.use_own_ua:
+            self._build_ua_file()
+
+    def _build_ua_file(self):
+        '''Build a file in a format compatible with the capture page'''
+        yesterday = (date.today() - timedelta(days=1))
+        self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
+        safe_create_dir(self_generated_ua_file_path)
+        self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
+        if self_generated_ua_file.exists():
+            return
+        redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
+        entries = redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
+        if not entries:
+            return
+
+        to_store: Dict[str, Any] = {'by_frequency': []}
+        uas = Counter([entry.split('|', 1)[1] for entry in entries])
+        for ua, _ in uas.most_common():
+            parsed_ua = UserAgent(ua)
+            if not parsed_ua.platform or not parsed_ua.browser:
+                continue
+            if parsed_ua.platform not in to_store:
+                to_store[parsed_ua.platform] = {}
+            if f'{parsed_ua.browser} {parsed_ua.version}' not in to_store[parsed_ua.platform]:
+                to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'] = []
+            to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'].append(parsed_ua.string)
+            to_store['by_frequency'].append({'os': parsed_ua.platform,
+                                             'browser': f'{parsed_ua.browser} {parsed_ua.version}',
+                                             'useragent': parsed_ua.string})
+        with self_generated_ua_file.open('w') as f:
+            json.dump(to_store, f, indent=2)
+
+        # Remove the UA / IP mapping.
+        redis.delete(f'user_agents|{yesterday.isoformat()}')
+
+
+def main():
+    p = Processing()
+    p.run(sleep_in_sec=3600 * 24)
+
+
+if __name__ == '__main__':
+    main()
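The new module (wired up as bin.background_processing further down in pyproject.toml) moves the user-agent aggregation out of the web process: it reads the per-day user_agents|YYYY-MM-DD sorted set, which the Lookyloo class fills with zincrby on every capture request (see the class hunk below), and writes a JSON file the capture page can consume. Note that script_name is set to 'archiver', apparently carried over from the archiver module. A rough sketch of the data flow, with made-up values and an illustrative socket path:

    from datetime import date, timedelta
    from redis import Redis

    r = Redis(unix_socket_path='/tmp/lookyloo_cache.sock', decode_responses=True)  # illustrative path
    day = (date.today() - timedelta(days=1)).isoformat()

    # What the frontend side does for each visitor (see the zincrby call in the Lookyloo hunk below):
    r.zincrby(f'user_agents|{day}', 1, '203.0.113.7|Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...')

    # Processing._build_ua_file() then writes own_user_agents/<year>/<month>/<day>.json,
    # roughly shaped like:
    # {
    #   "by_frequency": [{"os": "windows", "browser": "chrome 90", "useragent": "Mozilla/5.0 ..."}],
    #   "windows": {"chrome 90": ["Mozilla/5.0 ..."]}
    # }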
@@ -103,7 +103,7 @@ Run the following command (assuming you run the code from the clonned repository


 @lru_cache(64)
-def get_capture_dir() -> Path:
+def get_captures_dir() -> Path:
     capture_dir = get_homedir() / 'scraped'
     safe_create_dir(capture_dir)
     return capture_dir
@@ -365,13 +365,15 @@ def get_useragent_for_requests():


 def reload_uuids_index() -> None:
-    recent_uuids = {}
-    for uuid_path in sorted(get_capture_dir().glob('*/uuid'), reverse=True):
+    recent_uuids: Dict[str, str] = {}
+    for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
         with uuid_path.open() as f:
             uuid = f.read()
         recent_uuids[uuid] = str(uuid_path.parent)
+    if not recent_uuids:
+        return None
     r = Redis(unix_socket_path=get_socket_path('cache'))
     p = r.pipeline()
     p.delete('lookup_dirs')
-    p.hset('lookup_dirs', mapping=recent_uuids)
+    p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
     p.execute()
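The early return added to reload_uuids_index() guards the fresh-install case: with no capture on disk the mapping is empty, and redis-py refuses hset with an empty mapping, so the rebuild would previously have failed instead of doing nothing. A small sketch of that failure mode (socket path illustrative):

    from redis import Redis
    from redis.exceptions import DataError

    r = Redis(unix_socket_path='/tmp/lookyloo_cache.sock')  # illustrative path
    try:
        r.hset('lookup_dirs', mapping={})  # what the old code amounted to with zero captures
    except DataError:
        pass  # redis-py rejects an empty mapping, hence the new early return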
@@ -3,8 +3,8 @@

 import os
 import base64
-from collections import defaultdict, Counter
-from datetime import datetime, date, timedelta
+from collections import defaultdict
+from datetime import datetime, date
 from email.message import EmailMessage
 from io import BufferedIOBase, BytesIO
 import ipaddress
@@ -106,37 +106,32 @@ class Lookyloo():
         today = date.today().isoformat()
         self.redis.zincrby(f'user_agents|{today}', 1, f'{remote_ip}|{user_agent}')

-    def build_ua_file(self) -> None:
-        '''Build a file in a format compatible with the capture page'''
-        yesterday = (date.today() - timedelta(days=1))
-        self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
-        safe_create_dir(self_generated_ua_file_path)
-        self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
-        if self_generated_ua_file.exists():
-            return
-        entries = self.redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
-        if not entries:
-            return
-
-        to_store: Dict[str, Any] = {'by_frequency': []}
-        uas = Counter([entry.split('|', 1)[1] for entry in entries])
-        for ua, _ in uas.most_common():
-            parsed_ua = UserAgent(ua)
-            if not parsed_ua.platform or not parsed_ua.browser:
-                continue
-            if parsed_ua.platform not in to_store:
-                to_store[parsed_ua.platform] = {}
-            if f'{parsed_ua.browser} {parsed_ua.version}' not in to_store[parsed_ua.platform]:
-                to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'] = []
-            to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'].append(parsed_ua.string)
-            to_store['by_frequency'].append({'os': parsed_ua.platform,
-                                             'browser': f'{parsed_ua.browser} {parsed_ua.version}',
-                                             'useragent': parsed_ua.string})
-        with self_generated_ua_file.open('w') as f:
-            json.dump(to_store, f, indent=2)
-
-        # Remove the UA / IP mapping.
-        self.redis.delete(f'user_agents|{yesterday.isoformat()}')
+    def _get_capture_dir(self, capture_uuid: str, /) -> Path:
+        '''Use the cache to get a capture directory from a capture UUID'''
+        capture_dir: Optional[Union[str, Path]]
+        if capture_uuid in self._captures_index:
+            capture_dir = self._captures_index[capture_uuid].capture_dir
+            if capture_dir.exists():
+                return capture_dir
+            self.redis.delete(capture_dir)
+            self._captures_index.pop(capture_uuid)
+        capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
+        if capture_dir and not Path(capture_dir).exists():
+            # The capture was either removed or archived, cleaning up
+            self.redis.hdel('lookup_dirs', capture_uuid)
+            capture_dir = None
+        if not capture_dir:
+            # Try in the archive
+            capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        to_return = Path(capture_dir)
+        if not to_return.exists():
+            # The capture was removed, remove the UUID
+            self.redis.hdel('lookup_dirs_archived', capture_uuid)
+            self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
+            raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
+        return to_return

     def _cache_capture(self, capture_uuid: str, /) -> CrawledTree:
         '''Generate the pickle, set the cache, add capture in the indexes'''
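In this hunk the old build_ua_file() body disappears from the Lookyloo class (the logic now lives in the background processing daemon above) and _get_capture_dir() moves up in its place. The lookup goes through three layers: the in-memory _captures_index, the lookup_dirs hash, and finally lookup_dirs_archived, purging stale entries as it goes. A hypothetical caller, just to show the two failure modes:

    from lookyloo.lookyloo import Lookyloo
    from lookyloo.exceptions import MissingUUID, NoValidHarFile

    lookyloo = Lookyloo()  # hypothetical instance, as the web interface creates one
    try:
        capture_dir = lookyloo._get_capture_dir('00000000-0000-0000-0000-000000000000')  # placeholder UUID
    except MissingUUID:
        pass  # unknown UUID: neither cached nor archived
    except NoValidHarFile:
        pass  # known UUID pointing at a directory that no longer exists; the entry was just removed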
@@ -197,6 +192,85 @@ class Lookyloo():
         lock_file.unlink(missing_ok=True)
         return ct

+    def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
+        '''Populate the redis cache for a capture. Mostly used on the index page.
+        NOTE: Doesn't require the pickle.'''
+        with (capture_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+
+        har_files = sorted(capture_dir.glob('*.har'))
+
+        error_cache: Dict[str, str] = {}
+        if (capture_dir / 'error.txt').exists():
+            # Something went wrong
+            with (capture_dir / 'error.txt').open() as _error:
+                content = _error.read()
+                try:
+                    error_to_cache = json.loads(content)
+                    if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
+                        error_to_cache = error_to_cache.get('details')
+                except json.decoder.JSONDecodeError:
+                    # old format
+                    error_to_cache = content
+                error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
+
+        fatal_error = False
+        if har_files:
+            try:
+                har = HarFile(har_files[0], uuid)
+            except Har2TreeError as e:
+                error_cache['error'] = str(e)
+                fatal_error = True
+        else:
+            error_cache['error'] = f'No har files in {capture_dir.name}'
+            fatal_error = True
+
+        if (capture_dir / 'categories').exists():
+            with (capture_dir / 'categories').open() as _categories:
+                categories = [c.strip() for c in _categories.readlines()]
+        else:
+            categories = []
+
+        p = self.redis.pipeline()
+        p.hset('lookup_dirs', uuid, str(capture_dir))
+        if error_cache:
+            if 'HTTP Error' not in error_cache['error']:
+                self.logger.warning(error_cache['error'])
+            p.hmset(str(capture_dir), error_cache)
+
+        if not fatal_error:
+            redirects = har.initial_redirects
+            incomplete_redirects = False
+            if redirects and har.need_tree_redirects:
+                # load tree from disk, get redirects
+                ct = load_pickle_tree(capture_dir)
+                if ct:
+                    redirects = ct.redirects
+                else:
+                    # Pickle not available
+                    incomplete_redirects = True
+
+            cache: Dict[str, Union[str, int]] = {'uuid': uuid,
+                                                 'title': har.initial_title,
+                                                 'timestamp': har.initial_start_time,
+                                                 'url': har.root_url,
+                                                 'redirects': json.dumps(redirects),
+                                                 'categories': json.dumps(categories),
+                                                 'capture_dir': str(capture_dir),
+                                                 'incomplete_redirects': 1 if incomplete_redirects else 0}
+            if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
+                cache['no_index'] = 1
+
+            if (capture_dir / 'parent').exists():  # The capture was initiated from an other one
+                with (capture_dir / 'parent').open() as f:
+                    cache['parent'] = f.read().strip()
+
+            p.hmset(str(capture_dir), cache)
+        p.execute()
+        # If the cache is re-created for some reason, pop from the local cache.
+        self._captures_index.pop(uuid, None)
+        return cache
+
     def _build_cname_chain(self, known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
         '''Returns a list of CNAMEs starting from one hostname.
         The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
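_set_capture_cache() (moved up here; the old copy is deleted further down) stores the per-capture summary as a hash keyed by the capture directory, with redirects and categories JSON-encoded and incomplete_redirects as a 0/1 flag. Reading one entry back, as a sketch with an illustrative socket path and placeholder UUID:

    import json
    from redis import Redis

    r = Redis(unix_socket_path='/tmp/lookyloo_cache.sock', decode_responses=True)
    capture_dir = r.hget('lookup_dirs', 'some-capture-uuid')
    if capture_dir:
        cached = r.hgetall(capture_dir)
        redirects = json.loads(cached.get('redirects', '[]'))
        incomplete = bool(int(cached.get('incomplete_redirects', 0)))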
@@ -259,17 +333,6 @@ class Lookyloo():
             json.dump(host_ips, f)
         return ct

-    def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
-        '''Get the generated tree in ETE Toolkit format.
-        Loads the pickle if it exists, creates it otherwise.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
-        ct = load_pickle_tree(capture_dir)
-        if not ct:
-            ct = self._cache_capture(capture_uuid)
-        if not ct:
-            raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
-        return ct
-
     def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
                     legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
         '''Adds context information to a capture or a URL node'''
@@ -449,85 +512,6 @@ class Lookyloo():
                 to_return[event_id].update(values)
         return to_return

-    def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
-        '''Populate the redis cache for a capture. Mostly used on the index page.
-        NOTE: Doesn't require the pickle.'''
-        with (capture_dir / 'uuid').open() as f:
-            uuid = f.read().strip()
-
-        har_files = sorted(capture_dir.glob('*.har'))
-
-        error_cache: Dict[str, str] = {}
-        if (capture_dir / 'error.txt').exists():
-            # Something went wrong
-            with (capture_dir / 'error.txt').open() as _error:
-                content = _error.read()
-                try:
-                    error_to_cache = json.loads(content)
-                    if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
-                        error_to_cache = error_to_cache.get('details')
-                except json.decoder.JSONDecodeError:
-                    # old format
-                    error_to_cache = content
-                error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
-
-        fatal_error = False
-        if har_files:
-            try:
-                har = HarFile(har_files[0], uuid)
-            except Har2TreeError as e:
-                error_cache['error'] = str(e)
-                fatal_error = True
-        else:
-            error_cache['error'] = f'No har files in {capture_dir.name}'
-            fatal_error = True
-
-        if (capture_dir / 'categories').exists():
-            with (capture_dir / 'categories').open() as _categories:
-                categories = [c.strip() for c in _categories.readlines()]
-        else:
-            categories = []
-
-        p = self.redis.pipeline()
-        p.hset('lookup_dirs', uuid, str(capture_dir))
-        if error_cache:
-            if 'HTTP Error' not in error_cache['error']:
-                self.logger.warning(error_cache['error'])
-            p.hmset(str(capture_dir), error_cache)
-
-        if not fatal_error:
-            redirects = har.initial_redirects
-            incomplete_redirects = False
-            if redirects and har.need_tree_redirects:
-                # load tree from disk, get redirects
-                ct = load_pickle_tree(capture_dir)
-                if ct:
-                    redirects = ct.redirects
-                else:
-                    # Pickle not available
-                    incomplete_redirects = True
-
-            cache: Dict[str, Union[str, int]] = {'uuid': uuid,
-                                                 'title': har.initial_title,
-                                                 'timestamp': har.initial_start_time,
-                                                 'url': har.root_url,
-                                                 'redirects': json.dumps(redirects),
-                                                 'categories': json.dumps(categories),
-                                                 'capture_dir': str(capture_dir),
-                                                 'incomplete_redirects': 1 if incomplete_redirects else 0}
-            if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
-                cache['no_index'] = 1
-
-            if (capture_dir / 'parent').exists():  # The capture was initiated from an other one
-                with (capture_dir / 'parent').open() as f:
-                    cache['parent'] = f.read().strip()
-
-            p.hmset(str(capture_dir), cache)
-        p.execute()
-        # If the cache is re-created for some reason, pop from the local cache.
-        self._captures_index.pop(uuid, None)
-        return cache
-
     def hide_capture(self, capture_uuid: str, /) -> None:
         """Add the capture in the hidden pool (not shown on the front page)
         NOTE: it won't remove the correlations until they are rebuilt.
@@ -599,32 +583,16 @@ class Lookyloo():
             self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
             return None

-    def _get_capture_dir(self, capture_uuid: str, /) -> Path:
-        '''Use the cache to get a capture directory from a capture UUID'''
-        capture_dir: Optional[Union[str, Path]]
-        if capture_uuid in self._captures_index:
-            capture_dir = self._captures_index[capture_uuid].capture_dir
-            if capture_dir.exists():
-                return capture_dir
-            self.redis.delete(capture_dir)
-            self._captures_index.pop(capture_uuid)
-        capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
-        if capture_dir and not Path(capture_dir).exists():
-            # The capture was either removed or archived, cleaning up
-            self.redis.hdel('lookup_dirs', capture_uuid)
-            capture_dir = None
-        if not capture_dir:
-            # Try in the archive
-            capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
-        if not capture_dir:
-            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
-        to_return = Path(capture_dir)
-        if not to_return.exists():
-            # The capture was removed, remove the UUID
-            self.redis.hdel('lookup_dirs_archived', capture_uuid)
-            self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
-            raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
-        return to_return
+    def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
+        '''Get the generated tree in ETE Toolkit format.
+        Loads the pickle if it exists, creates it otherwise.'''
+        capture_dir = self._get_capture_dir(capture_uuid)
+        ct = load_pickle_tree(capture_dir)
+        if not ct:
+            ct = self._cache_capture(capture_uuid)
+        if not ct:
+            raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
+        return ct

     def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus:
         redis = self.redis  # use a single connection
@@ -32,6 +32,7 @@ rebuild_caches = "bin.rebuild_caches:main"
 update = "bin.update:main"
 background_indexer = "bin.background_indexer:main"
 archiver = "bin.archiver:main"
+processing = "bin.background_processing:main"


 [tool.poetry.dependencies]
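The added processing entry sits with the other console scripts (presumably under [tool.poetry.scripts]), so the new daemon can be started the same way as archiver or background_indexer, e.g. with poetry run processing. The generated entry point boils down to something like:

    # Rough equivalent of the generated console script (an assumption, not code from the repository):
    from bin.background_processing import main

    if __name__ == '__main__':
        main()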
@@ -21,7 +21,7 @@ from werkzeug.security import check_password_hash

 from pymisp import MISPEvent, MISPServerError

-from lookyloo.helpers import (update_user_agents, get_user_agents, get_config,
+from lookyloo.helpers import (get_user_agents, get_config,
                               get_taxonomies, load_cookies, CaptureStatus)
 from lookyloo.lookyloo import Lookyloo, Indexing
 from lookyloo.exceptions import NoValidHarFile, MissingUUID
@@ -679,10 +679,6 @@ def index():
     if request.method == 'HEAD':
         # Just returns ack if the webserver is running
         return 'Ack'
-    if use_own_ua:
-        lookyloo.build_ua_file()
-    else:
-        update_user_agents()
     show_error, category = get_index_params(request)
     return index_generic(show_error=show_error)
