chg: cleanup in the main lookyloo class

pull/251/head
Raphaël Vinot 2021-08-24 18:32:54 +02:00
parent 8433cbcc1b
commit 81390d5ea0
6 changed files with 203 additions and 163 deletions


@@ -25,7 +25,7 @@ class Archiver(AbstractManager):
self.redis = Redis(unix_socket_path=get_socket_path('cache'))
# make sure archived captures dir exists
- self.archived_captures_dir = get_homedir / 'archived_captures'
+ self.archived_captures_dir = get_homedir() / 'archived_captures'
self.archived_captures_dir.mkdir(parents=True, exist_ok=True)
self._load_archives()
@@ -78,8 +78,8 @@ class Archiver(AbstractManager):
if archived_uuids:
p = self.redis.pipeline()
- p.redis.hdel('lookup_dirs', *archived_uuids.keys())
- p.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+ p.hdel('lookup_dirs', *archived_uuids.keys())
+ p.hmset('lookup_dirs_archived', archived_uuids) # type: ignore
p.execute()
self.logger.info('Archiving done.')
@@ -91,8 +91,8 @@ class Archiver(AbstractManager):
if not (month / 'index').exists():
continue
with (month / 'index').open('r') as _f:
- archived_uuids = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
- self.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+ archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
+ self.redis.hmset('lookup_dirs_archived', archived_uuids) # type: ignore
def main():
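A minimal sketch of the Redis pipeline pattern the corrected hunks above rely on: commands are queued on the pipeline object itself (not on the underlying client) and sent in a single round trip on execute(). The socket path, UUID and directory below are made-up placeholders, not taken from the Lookyloo configuration.

from redis import Redis

# Hypothetical connection and mapping, for illustration only.
redis = Redis(unix_socket_path='/tmp/cache.sock', decode_responses=True)
archived_uuids = {'some-uuid': '/path/to/archived_captures/2021/08/some_capture'}

p = redis.pipeline()
# Queue both commands on the pipeline object itself.
p.hdel('lookup_dirs', *archived_uuids.keys())
# hmset() is the older redis-py call; hset(name, mapping=...) is the newer equivalent.
p.hmset('lookup_dirs_archived', archived_uuids)
# Both commands are sent to Redis in one round trip.
p.execute()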

bin/background_processing.py (new executable file, 73 additions)

@@ -0,0 +1,73 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import logging
from collections import Counter
from datetime import timedelta, date
from typing import Dict, Any
from redis import Redis
from werkzeug.useragents import UserAgent
from lookyloo.abstractmanager import AbstractManager
from lookyloo.helpers import (get_config, get_homedir, get_socket_path,
safe_create_dir)
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
level=logging.INFO, datefmt='%I:%M:%S')
class Processing(AbstractManager):
def __init__(self, loglevel: int=logging.INFO):
super().__init__(loglevel)
self.script_name = 'archiver'
self.use_own_ua = get_config('generic', 'use_user_agents_users')
def _to_run_forever(self):
if self.use_own_ua:
self._build_ua_file()
def _build_ua_file(self):
'''Build a file in a format compatible with the capture page'''
yesterday = (date.today() - timedelta(days=1))
self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
safe_create_dir(self_generated_ua_file_path)
self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
if self_generated_ua_file.exists():
return
redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
entries = redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
if not entries:
return
to_store: Dict[str, Any] = {'by_frequency': []}
uas = Counter([entry.split('|', 1)[1] for entry in entries])
for ua, _ in uas.most_common():
parsed_ua = UserAgent(ua)
if not parsed_ua.platform or not parsed_ua.browser:
continue
if parsed_ua.platform not in to_store:
to_store[parsed_ua.platform] = {}
if f'{parsed_ua.browser} {parsed_ua.version}' not in to_store[parsed_ua.platform]:
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'] = []
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'].append(parsed_ua.string)
to_store['by_frequency'].append({'os': parsed_ua.platform,
'browser': f'{parsed_ua.browser} {parsed_ua.version}',
'useragent': parsed_ua.string})
with self_generated_ua_file.open('w') as f:
json.dump(to_store, f, indent=2)
# Remove the UA / IP mapping.
redis.delete(f'user_agents|{yesterday.isoformat()}')
def main():
p = Processing()
p.run(sleep_in_sec=3600 * 24)
if __name__ == '__main__':
main()
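For illustration, the JSON document _build_ua_file writes to own_user_agents/&lt;year&gt;/&lt;month&gt;/&lt;YYYY-MM-DD&gt;.json ends up shaped roughly as below; the platform, browser and user-agent strings are made-up examples, not real captured data.

# Hypothetical example of the generated file's structure (values are invented).
example = {
    'by_frequency': [
        {'os': 'windows',
         'browser': 'firefox 91.0',
         'useragent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'},
    ],
    'windows': {
        'firefox 91.0': [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        ],
    },
}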


@@ -103,7 +103,7 @@ Run the following command (assuming you run the code from the clonned repository
@lru_cache(64)
- def get_capture_dir() -> Path:
+ def get_captures_dir() -> Path:
capture_dir = get_homedir() / 'scraped'
safe_create_dir(capture_dir)
return capture_dir
@@ -365,13 +365,15 @@ def get_useragent_for_requests():
def reload_uuids_index() -> None:
- recent_uuids = {}
- for uuid_path in sorted(get_capture_dir().glob('*/uuid'), reverse=True):
+ recent_uuids: Dict[str, str] = {}
+ for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
with uuid_path.open() as f:
uuid = f.read()
recent_uuids[uuid] = str(uuid_path.parent)
if not recent_uuids:
return None
r = Redis(unix_socket_path=get_socket_path('cache'))
p = r.pipeline()
p.delete('lookup_dirs')
- p.hset('lookup_dirs', mapping=recent_uuids)
+ p.hset('lookup_dirs', mapping=recent_uuids) # type: ignore
p.execute()


@@ -3,8 +3,8 @@
import os
import base64
- from collections import defaultdict, Counter
- from datetime import datetime, date, timedelta
+ from collections import defaultdict
+ from datetime import datetime, date
from email.message import EmailMessage
from io import BufferedIOBase, BytesIO
import ipaddress
@@ -106,37 +106,32 @@ class Lookyloo():
today = date.today().isoformat()
self.redis.zincrby(f'user_agents|{today}', 1, f'{remote_ip}|{user_agent}')
def build_ua_file(self) -> None:
'''Build a file in a format compatible with the capture page'''
yesterday = (date.today() - timedelta(days=1))
self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
safe_create_dir(self_generated_ua_file_path)
self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
if self_generated_ua_file.exists():
return
entries = self.redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
if not entries:
return
to_store: Dict[str, Any] = {'by_frequency': []}
uas = Counter([entry.split('|', 1)[1] for entry in entries])
for ua, _ in uas.most_common():
parsed_ua = UserAgent(ua)
if not parsed_ua.platform or not parsed_ua.browser:
continue
if parsed_ua.platform not in to_store:
to_store[parsed_ua.platform] = {}
if f'{parsed_ua.browser} {parsed_ua.version}' not in to_store[parsed_ua.platform]:
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'] = []
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'].append(parsed_ua.string)
to_store['by_frequency'].append({'os': parsed_ua.platform,
'browser': f'{parsed_ua.browser} {parsed_ua.version}',
'useragent': parsed_ua.string})
with self_generated_ua_file.open('w') as f:
json.dump(to_store, f, indent=2)
# Remove the UA / IP mapping.
self.redis.delete(f'user_agents|{yesterday.isoformat()}')
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
'''Use the cache to get a capture directory from a capture UUID'''
capture_dir: Optional[Union[str, Path]]
if capture_uuid in self._captures_index:
capture_dir = self._captures_index[capture_uuid].capture_dir
if capture_dir.exists():
return capture_dir
self.redis.delete(capture_dir)
self._captures_index.pop(capture_uuid)
capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
if capture_dir and not Path(capture_dir).exists():
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', capture_uuid)
capture_dir = None
if not capture_dir:
# Try in the archive
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
to_return = Path(capture_dir)
if not to_return.exists():
# The capture was removed, remove the UUID
self.redis.hdel('lookup_dirs_archived', capture_uuid)
self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
return to_return
def _cache_capture(self, capture_uuid: str, /) -> CrawledTree:
'''Generate the pickle, set the cache, add capture in the indexes'''
@@ -197,6 +192,85 @@ class Lookyloo():
lock_file.unlink(missing_ok=True)
return ct
def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
'''Populate the redis cache for a capture. Mostly used on the index page.
NOTE: Doesn't require the pickle.'''
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
har_files = sorted(capture_dir.glob('*.har'))
error_cache: Dict[str, str] = {}
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (capture_dir / 'error.txt').open() as _error:
content = _error.read()
try:
error_to_cache = json.loads(content)
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
error_to_cache = error_to_cache.get('details')
except json.decoder.JSONDecodeError:
# old format
error_to_cache = content
error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
fatal_error = False
if har_files:
try:
har = HarFile(har_files[0], uuid)
except Har2TreeError as e:
error_cache['error'] = str(e)
fatal_error = True
else:
error_cache['error'] = f'No har files in {capture_dir.name}'
fatal_error = True
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
categories = [c.strip() for c in _categories.readlines()]
else:
categories = []
p = self.redis.pipeline()
p.hset('lookup_dirs', uuid, str(capture_dir))
if error_cache:
if 'HTTP Error' not in error_cache['error']:
self.logger.warning(error_cache['error'])
p.hmset(str(capture_dir), error_cache)
if not fatal_error:
redirects = har.initial_redirects
incomplete_redirects = False
if redirects and har.need_tree_redirects:
# load tree from disk, get redirects
ct = load_pickle_tree(capture_dir)
if ct:
redirects = ct.redirects
else:
# Pickle not available
incomplete_redirects = True
cache: Dict[str, Union[str, int]] = {'uuid': uuid,
'title': har.initial_title,
'timestamp': har.initial_start_time,
'url': har.root_url,
'redirects': json.dumps(redirects),
'categories': json.dumps(categories),
'capture_dir': str(capture_dir),
'incomplete_redirects': 1 if incomplete_redirects else 0}
if (capture_dir / 'no_index').exists(): # If the folders claims anonymity
cache['no_index'] = 1
if (capture_dir / 'parent').exists(): # The capture was initiated from an other one
with (capture_dir / 'parent').open() as f:
cache['parent'] = f.read().strip()
p.hmset(str(capture_dir), cache)
p.execute()
# If the cache is re-created for some reason, pop from the local cache.
self._captures_index.pop(uuid, None)
return cache
def _build_cname_chain(self, known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
'''Returns a list of CNAMEs starting from one hostname.
The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
@ -259,17 +333,6 @@ class Lookyloo():
json.dump(host_ips, f)
return ct
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
'''Get the generated tree in ETE Toolkit format.
Loads the pickle if it exists, creates it otherwise.'''
capture_dir = self._get_capture_dir(capture_uuid)
ct = load_pickle_tree(capture_dir)
if not ct:
ct = self._cache_capture(capture_uuid)
if not ct:
raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
return ct
def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
'''Adds context information to a capture or a URL node'''
@@ -449,85 +512,6 @@ class Lookyloo():
to_return[event_id].update(values)
return to_return
def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
'''Populate the redis cache for a capture. Mostly used on the index page.
NOTE: Doesn't require the pickle.'''
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
har_files = sorted(capture_dir.glob('*.har'))
error_cache: Dict[str, str] = {}
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (capture_dir / 'error.txt').open() as _error:
content = _error.read()
try:
error_to_cache = json.loads(content)
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
error_to_cache = error_to_cache.get('details')
except json.decoder.JSONDecodeError:
# old format
error_to_cache = content
error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
fatal_error = False
if har_files:
try:
har = HarFile(har_files[0], uuid)
except Har2TreeError as e:
error_cache['error'] = str(e)
fatal_error = True
else:
error_cache['error'] = f'No har files in {capture_dir.name}'
fatal_error = True
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
categories = [c.strip() for c in _categories.readlines()]
else:
categories = []
p = self.redis.pipeline()
p.hset('lookup_dirs', uuid, str(capture_dir))
if error_cache:
if 'HTTP Error' not in error_cache['error']:
self.logger.warning(error_cache['error'])
p.hmset(str(capture_dir), error_cache)
if not fatal_error:
redirects = har.initial_redirects
incomplete_redirects = False
if redirects and har.need_tree_redirects:
# load tree from disk, get redirects
ct = load_pickle_tree(capture_dir)
if ct:
redirects = ct.redirects
else:
# Pickle not available
incomplete_redirects = True
cache: Dict[str, Union[str, int]] = {'uuid': uuid,
'title': har.initial_title,
'timestamp': har.initial_start_time,
'url': har.root_url,
'redirects': json.dumps(redirects),
'categories': json.dumps(categories),
'capture_dir': str(capture_dir),
'incomplete_redirects': 1 if incomplete_redirects else 0}
if (capture_dir / 'no_index').exists(): # If the folders claims anonymity
cache['no_index'] = 1
if (capture_dir / 'parent').exists(): # The capture was initiated from an other one
with (capture_dir / 'parent').open() as f:
cache['parent'] = f.read().strip()
p.hmset(str(capture_dir), cache)
p.execute()
# If the cache is re-created for some reason, pop from the local cache.
self._captures_index.pop(uuid, None)
return cache
def hide_capture(self, capture_uuid: str, /) -> None:
"""Add the capture in the hidden pool (not shown on the front page)
NOTE: it won't remove the correlations until they are rebuilt.
@@ -599,32 +583,16 @@ class Lookyloo():
self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
return None
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
'''Use the cache to get a capture directory from a capture UUID'''
capture_dir: Optional[Union[str, Path]]
if capture_uuid in self._captures_index:
capture_dir = self._captures_index[capture_uuid].capture_dir
if capture_dir.exists():
return capture_dir
self.redis.delete(capture_dir)
self._captures_index.pop(capture_uuid)
capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
if capture_dir and not Path(capture_dir).exists():
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', capture_uuid)
capture_dir = None
if not capture_dir:
# Try in the archive
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
to_return = Path(capture_dir)
if not to_return.exists():
# The capture was removed, remove the UUID
self.redis.hdel('lookup_dirs_archived', capture_uuid)
self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
return to_return
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
'''Get the generated tree in ETE Toolkit format.
Loads the pickle if it exists, creates it otherwise.'''
capture_dir = self._get_capture_dir(capture_uuid)
ct = load_pickle_tree(capture_dir)
if not ct:
ct = self._cache_capture(capture_uuid)
if not ct:
raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
return ct
def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus:
redis = self.redis # use a single connection


@@ -32,6 +32,7 @@ rebuild_caches = "bin.rebuild_caches:main"
update = "bin.update:main"
background_indexer = "bin.background_indexer:main"
archiver = "bin.archiver:main"
+ processing = "bin.background_processing:main"
[tool.poetry.dependencies]


@@ -21,7 +21,7 @@ from werkzeug.security import check_password_hash
from pymisp import MISPEvent, MISPServerError
- from lookyloo.helpers import (update_user_agents, get_user_agents, get_config,
+ from lookyloo.helpers import (get_user_agents, get_config,
get_taxonomies, load_cookies, CaptureStatus)
from lookyloo.lookyloo import Lookyloo, Indexing
from lookyloo.exceptions import NoValidHarFile, MissingUUID
@@ -679,10 +679,6 @@ def index():
if request.method == 'HEAD':
# Just returns ack if the webserver is running
return 'Ack'
- if use_own_ua:
- lookyloo.build_ua_file()
- else:
- update_user_agents()
show_error, category = get_index_params(request)
return index_generic(show_error=show_error)