chg: Improve tree creation and cache

pull/265/head
Raphaël Vinot 2021-09-22 17:09:04 +02:00
parent 01fff00cad
commit 32ee474be2
5 changed files with 326 additions and 326 deletions

View File

@@ -91,6 +91,8 @@ class BackgroundIndexer(AbstractManager):
if not indexed[2]:
self.logger.info(f'Indexing cookies for {cache.uuid}')
self.lookyloo.indexing.index_cookies_capture(ct)
# NOTE: categories aren't taken into account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories)
def main():
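The NOTE above flags that capture categories are not indexed by the background indexer yet. A rough, hypothetical sketch of the missing step (not part of this commit): read the `categories` file from the capture directory in the same one-entry-per-line format `_set_capture_cache` uses further down, then feed it to the `index_categories_capture(capture_uuid, categories)` call the comment refers to.

from pathlib import Path
from typing import List

def read_categories(capture_dir: Path) -> List[str]:
    # Same on-disk format as _set_capture_cache reads below: a 'categories' file, one category per line.
    categories_file = capture_dir / 'categories'
    if not categories_file.exists():
        return []
    return [c.strip() for c in categories_file.read_text().splitlines() if c.strip()]

# Hypothetical call site, mirroring the cookies branch above:
# self.lookyloo.indexing.index_categories_capture(cache.uuid, read_categories(cache.capture_dir))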

View File

@@ -2,11 +2,26 @@
# -*- coding: utf-8 -*-
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import logging
import pickle
import sys
import time
from .exceptions import LookylooException, MissingCaptureDirectory
from collections.abc import Mapping
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import dns.rdatatype
import dns.resolver
from har2tree import CrawledTree, Har2TreeError, HarFile
from redis import Redis
from .context import Context
from .exceptions import (LookylooException, MissingCaptureDirectory, NoValidHarFile,
MissingUUID, TreeNeedsRebuild)
from .helpers import try_make_file, get_config
class CaptureCache():
@@ -35,3 +50,287 @@ class CaptureCache():
self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
self.categories: List[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else []
self.parent: Optional[str] = cache_entry.get('parent')
@property
def tree(self) -> CrawledTree:
try:
return load_pickle_tree(self.capture_dir)
except TreeNeedsRebuild:
# The pickle is missing, that shouldn't happen at this stage.
raise LookylooException(f'Unable to get pickle for {self.uuid} - {self.capture_dir}')
def remove_pickle_tree(capture_dir: Path) -> None:
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
pickle_file.unlink()
@lru_cache(maxsize=1024)
def load_pickle_tree(capture_dir: Path) -> CrawledTree:
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
with pickle_file.open('rb') as _p:
try:
return pickle.load(_p)
except pickle.UnpicklingError:
remove_pickle_tree(capture_dir)
except EOFError:
remove_pickle_tree(capture_dir)
except Exception:
remove_pickle_tree(capture_dir)
raise TreeNeedsRebuild()
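Because `load_pickle_tree` is wrapped in `functools.lru_cache` keyed on the capture directory `Path`, repeated lookups of the same capture return the same in-memory `CrawledTree` without re-reading `tree.pickle`. A standalone toy illustration of that memoization behaviour (not Lookyloo code):

from functools import lru_cache
from pathlib import Path

@lru_cache(maxsize=1024)
def load(capture_dir: Path) -> object:
    print(f'cache miss, loading {capture_dir}')  # only runs the first time a given Path is seen
    return object()

a = load(Path('/tmp/capture1'))
b = load(Path('/tmp/capture1'))  # served from the cache, no print
assert a is b
load.cache_clear()  # lru_cache also exposes cache_clear() to drop every memoized entry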
class CapturesIndex(Mapping):
def __init__(self, redis: Redis, contextualizer: Optional[Context]=None):
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(get_config('generic', 'loglevel'))
self.redis = redis
self.contextualizer = contextualizer
self.__cache: Dict[str, CaptureCache] = {}
def __getitem__(self, uuid: str) -> CaptureCache:
if uuid in self.__cache:
if (self.__cache[uuid].capture_dir.exists()
and not self.__cache[uuid].incomplete_redirects):
return self.__cache[uuid]
del self.__cache[uuid]
capture_dir = self._get_capture_dir(uuid)
cached = self.redis.hgetall(str(capture_dir))
if cached:
cc = CaptureCache(cached)
# NOTE: checking whether the pickle exists may be a bad idea here.
if (cc.capture_dir.exists()
and (cc.capture_dir / 'tree.pickle').exists()
and not cc.incomplete_redirects):
self.__cache[uuid] = cc
return self.__cache[uuid]
try:
tree = load_pickle_tree(capture_dir)
except TreeNeedsRebuild:
tree = self._create_pickle(capture_dir)
self.__cache[uuid] = self._set_capture_cache(capture_dir, tree)
return self.__cache[uuid]
def __iter__(self):
return iter(self.__cache)
def __len__(self):
return len(self.__cache)
def reload_cache(self, uuid: str) -> None:
if uuid in self.__cache:
del self.__cache[uuid]
def remove_pickle(self, uuid: str) -> None:
if uuid in self.__cache:
remove_pickle_tree(self.__cache[uuid].capture_dir)
del self.__cache[uuid]
def rebuild_all(self) -> None:
for uuid, cache in self.__cache.items():
remove_pickle_tree(cache.capture_dir)
self.redis.flushdb()
self.__cache = {}
def _get_capture_dir(self, uuid: str) -> Path:
# Try to get from the recent captures cache in redis
capture_dir = self.redis.hget('lookup_dirs', uuid)
if capture_dir:
to_return = Path(capture_dir)
if to_return.exists():
return to_return
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', uuid)
self.redis.delete(capture_dir)
# Try to get from the archived captures cache in redis
capture_dir = self.redis.hget('lookup_dirs_archived', uuid)
if capture_dir:
to_return = Path(capture_dir)
if to_return.exists():
return to_return
self.redis.hdel('lookup_dirs_archived', uuid)
# The capture was removed, remove the UUID
self.logger.warning(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
raise MissingUUID(f'Unable to find UUID {uuid}.')
def _create_pickle(self, capture_dir: Path) -> CrawledTree:
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
lock_file = capture_dir / 'lock'
if try_make_file(lock_file):
# Lock created, we can process
with lock_file.open('w') as f:
f.write(datetime.now().isoformat())
else:
# The pickle is being created somewhere else, wait until it's done.
while lock_file.exists():
time.sleep(5)
return load_pickle_tree(capture_dir)
har_files = sorted(capture_dir.glob('*.har'))
pickle_file = capture_dir / 'tree.pickle'
try:
tree = CrawledTree(har_files, uuid)
self.__resolve_dns(tree)
if self.contextualizer:
self.contextualizer.contextualize_tree(tree)
except Har2TreeError as e:
raise NoValidHarFile(e)
except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
else:
with pickle_file.open('wb') as _p:
# Some pickles require a pretty high recursion limit, this kind of fixes it.
# If the capture is really broken (generally a refresh to self), the capture
# is discarded in the RecursionError above.
default_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(int(default_recursion_limit * 1.1))
try:
pickle.dump(tree, _p)
except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
sys.setrecursionlimit(default_recursion_limit)
finally:
lock_file.unlink(missing_ok=True)
return tree
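`try_make_file` (imported from `.helpers`) is what makes the lock above safe across processes: only the caller that actually creates the lock file builds the pickle, the others wait for the file to disappear. A plausible minimal implementation, assuming an atomic exclusive create (the real helper is not shown in this diff):

from pathlib import Path

def try_make_file(file: Path) -> bool:
    try:
        file.touch(exist_ok=False)  # atomic: raises if another process created the file first
        return True
    except FileExistsError:
        return False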
def _set_capture_cache(self, capture_dir: Path, tree: Optional[CrawledTree]=None) -> CaptureCache:
'''Populate the redis cache for a capture. Mostly used on the index page.
NOTE: Doesn't require the pickle.'''
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': str(capture_dir)}
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (capture_dir / 'error.txt').open() as _error:
content = _error.read()
try:
error_to_cache = json.loads(content)
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
error_to_cache = error_to_cache.get('details')
except json.decoder.JSONDecodeError:
# old format
error_to_cache = content
cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
if (har_files := sorted(capture_dir.glob('*.har'))):
try:
har = HarFile(har_files[0], uuid)
cache['title'] = har.initial_title
cache['timestamp'] = har.initial_start_time
cache['url'] = har.root_url
if har.initial_redirects and har.need_tree_redirects:
if not tree:
# try to load tree from disk
tree = load_pickle_tree(capture_dir)
# get redirects
if tree:
cache['redirects'] = json.dumps(tree.redirects)
cache['incomplete_redirects'] = 0
else:
# Pickle not available
cache['redirects'] = json.dumps(har.initial_redirects)
cache['incomplete_redirects'] = 1
else:
cache['redirects'] = json.dumps(har.initial_redirects)
cache['incomplete_redirects'] = 0
except Har2TreeError as e:
cache['error'] = str(e)
else:
cache['error'] = f'No har files in {capture_dir.name}'
if (cache.get('error')
and isinstance(cache['error'], str)
and 'HTTP Error' not in cache['error']):
self.logger.warning(cache['error'])
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
if (capture_dir / 'no_index').exists():
# If the folder claims anonymity
cache['no_index'] = 1
if (capture_dir / 'parent').exists():
# The capture was initiated from another one
with (capture_dir / 'parent').open() as f:
cache['parent'] = f.read().strip()
p = self.redis.pipeline()
p.hset('lookup_dirs', uuid, str(capture_dir))
p.hmset(str(capture_dir), cache) # type: ignore
p.execute()
return CaptureCache(cache)
def __resolve_dns(self, ct: CrawledTree):
'''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
and stores them in ips.json and cnames.json, in the capture directory.
Updates the nodes of the tree accordingly so the information is available.
'''
def _build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
'''Returns a list of CNAMEs starting from one hostname.
The CNAME resolutions are made in `__resolve_dns`. A hostname can have a CNAME entry
and that CNAME entry can have another CNAME entry, and so on multiple times.
This method loops over the hostnames until there are no more CNAMEs.'''
cnames: List[str] = []
to_search = hostname
while True:
if known_cnames.get(to_search) is None:
break
# At this point, known_cnames[to_search] must exist and be a str
cnames.append(known_cnames[to_search]) # type: ignore
to_search = known_cnames[to_search]
return cnames
cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
ips_path = ct.root_hartree.har.path.parent / 'ips.json'
host_cnames: Dict[str, Optional[str]] = {}
if cnames_path.exists():
with cnames_path.open() as f:
host_cnames = json.load(f)
host_ips: Dict[str, List[str]] = {}
if ips_path.exists():
with ips_path.open() as f:
host_ips = json.load(f)
for node in ct.root_hartree.hostname_tree.traverse():
if node.name not in host_cnames or node.name not in host_ips:
# Resolve and cache
try:
response = dns.resolver.resolve(node.name, search=True)
for answer in response.response.answer:
if answer.rdtype == dns.rdatatype.RdataType.CNAME:
host_cnames[str(answer.name).rstrip('.')] = str(answer[0].target).rstrip('.')
else:
host_cnames[str(answer.name).rstrip('.')] = None
if answer.rdtype in [dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA]:
host_ips[str(answer.name).rstrip('.')] = list(set(str(b) for b in answer))
except Exception:
host_cnames[node.name] = None
host_ips[node.name] = []
cnames = _build_cname_chain(host_cnames, node.name)
if cnames:
node.add_feature('cname', cnames)
if cnames[-1] in host_ips:
node.add_feature('resolved_ips', host_ips[cnames[-1]])
elif node.name in host_ips:
node.add_feature('resolved_ips', host_ips[node.name])
with cnames_path.open('w') as f:
json.dump(host_cnames, f)
with ips_path.open('w') as f:
json.dump(host_ips, f)
return ct
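To make the chain walk in `_build_cname_chain` concrete, here is a toy `host_cnames` mapping (hypothetical hostnames) and the chain it yields; the walk stops at the first hostname that maps to None:

host_cnames = {'www.example.org': 'edge.cdn.example.net',
               'edge.cdn.example.net': 'lb.cdn.example.net',
               'lb.cdn.example.net': None}
# _build_cname_chain(host_cnames, 'www.example.org')
# -> ['edge.cdn.example.net', 'lb.cdn.example.net']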
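Taken together, `CapturesIndex` acts as a read-through mapping from capture UUID to `CaptureCache`: a lookup falls back to Redis and rebuilds the pickle on demand when it is missing or corrupt. A sketch of how the `Lookyloo` class further down ends up using it (assuming the `lookyloo` package layout and a Redis connection with `decode_responses=True`; the socket path and wiring are placeholders, the real `Lookyloo.__init__` goes through a connection pool):

from redis import Redis
from lookyloo.capturecache import CapturesIndex

redis = Redis(unix_socket_path='cache.sock', decode_responses=True)
captures = CapturesIndex(redis, contextualizer=None)  # a Context instance is optional

cache = captures['<capture-uuid>']        # builds tree.pickle on demand and caches the CaptureCache
tree = cache.tree                         # the pickled CrawledTree, via the new tree property
captures.reload_cache('<capture-uuid>')   # drop the in-memory entry, keep the pickle
captures.remove_pickle('<capture-uuid>')  # drop the entry and delete tree.pickle on disk
captures.rebuild_all()                    # remove every cached pickle and flush the Redis DB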

View File

@@ -28,3 +28,7 @@ class MissingUUID(LookylooException):
class MissingCaptureDirectory(LookylooException):
pass
class TreeNeedsRebuild(LookylooException):
pass

View File

@@ -3,7 +3,6 @@
import json
import logging
import os
import pickle
from datetime import datetime, timedelta
from enum import IntEnum, unique
from functools import lru_cache
@@ -216,28 +215,6 @@ def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str]]=None) -
return to_return
def load_pickle_tree(capture_dir: Path) -> Optional[CrawledTree]:
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
with pickle_file.open('rb') as _p:
try:
return pickle.load(_p)
except pickle.UnpicklingError:
remove_pickle_tree(capture_dir)
except EOFError:
remove_pickle_tree(capture_dir)
except Exception:
remove_pickle_tree(capture_dir)
return None
def remove_pickle_tree(capture_dir: Path) -> None:
pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists():
pickle_file.unlink()
def uniq_domains(uniq_urls):
domains = set()
for url in uniq_urls:

View File

@@ -5,10 +5,7 @@ import base64
import json
import logging
import operator
import pickle
import smtplib
import sys
import time
from collections import defaultdict
from datetime import date, datetime
from email.message import EmailMessage
@@ -19,9 +16,7 @@ from typing import (Any, Dict, Iterable, List, MutableMapping, Optional, Set,
from uuid import uuid4
from zipfile import ZipFile
import dns.rdatatype
import dns.resolver
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
from har2tree import CrawledTree, Har2TreeError, HostNode, URLNode
from PIL import Image # type: ignore
from pymisp import MISPAttribute, MISPEvent, MISPObject
from pymisp.tools import FileObject, URLObject
@@ -29,15 +24,13 @@ from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection
from werkzeug.useragents import UserAgent
from .capturecache import CaptureCache
from .capturecache import CaptureCache, CapturesIndex
from .context import Context
from .exceptions import (LookylooException, MissingCaptureDirectory,
MissingUUID, NoValidHarFile)
MissingUUID)
from .helpers import (CaptureStatus, get_captures_dir, get_config,
get_email_template, get_homedir, get_resources_hashes,
get_socket_path, get_splash_url, get_taxonomies,
load_pickle_tree, remove_pickle_tree, try_make_file,
uniq_domains)
get_socket_path, get_splash_url, get_taxonomies, uniq_domains)
from .indexing import Indexing
from .modules import (MISP, PhishingInitiative, UniversalWhois,
UrlScan, VirusTotal, Phishtank)
@@ -86,7 +79,7 @@ class Lookyloo():
self.logger.warning('Unable to setup the Phishtank module')
self.context = Context()
self._captures_index: Dict[str, CaptureCache] = {}
self._captures_index = CapturesIndex(self.redis, self.context)
@property
def redis(self):
@@ -94,233 +87,7 @@
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
'''Use the cache to get a capture directory from a capture UUID'''
capture_dir: Optional[str]
to_return: Path
# Try to get from the in-class cache
if capture_uuid in self._captures_index:
to_return = self._captures_index[capture_uuid].capture_dir
if to_return.exists():
return to_return
self.redis.delete(str(to_return))
self._captures_index.pop(capture_uuid)
# Try to get from the recent captures cache in redis
capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
if capture_dir:
to_return = Path(capture_dir)
if to_return.exists():
return to_return
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', capture_uuid)
self.redis.delete(capture_dir)
# Try to get from the archived captures cache in redis
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
if capture_dir:
to_return = Path(capture_dir)
if to_return.exists():
return to_return
self.redis.hdel('lookup_dirs_archived', capture_uuid)
# The capture was removed, remove the UUID
self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}).')
raise MissingCaptureDirectory(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}).')
raise MissingUUID(f'Unable to find UUID {capture_uuid}.')
def _cache_capture(self, capture_uuid: str, /) -> CrawledTree:
'''Generate the pickle, set the cache, add capture in the indexes'''
capture_dir = self._get_capture_dir(capture_uuid)
har_files = sorted(capture_dir.glob('*.har'))
lock_file = capture_dir / 'lock'
pickle_file = capture_dir / 'tree.pickle'
if try_make_file(lock_file):
# Lock created, we can process
with lock_file.open('w') as f:
f.write(datetime.now().isoformat())
else:
# The pickle is being created somewhere else, wait until it's done.
while lock_file.exists():
time.sleep(5)
keep_going = 5
while (ct := load_pickle_tree(capture_dir)) is None:
keep_going -= 1
if not keep_going:
raise LookylooException(f'Unable to get tree for {capture_uuid}')
time.sleep(5)
return ct
# NOTE: We only index the public captures
index = True
try:
ct = CrawledTree(har_files, capture_uuid)
self._resolve_dns(ct)
self.context.contextualize_tree(ct)
cache = self.capture_cache(capture_uuid)
if not cache:
raise LookylooException(f'Broken cache for {capture_dir}')
if self.is_public_instance:
if cache.no_index:
index = False
if index:
self.indexing.index_cookies_capture(ct)
self.indexing.index_body_hashes_capture(ct)
self.indexing.index_url_capture(ct)
categories = list(self.categories_capture(capture_uuid).keys())
self.indexing.index_categories_capture(capture_uuid, categories)
except Har2TreeError as e:
raise NoValidHarFile(e)
except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
else:
with pickle_file.open('wb') as _p:
# Some pickles require a pretty high recursion limit, this kind of fixes it.
# If the capture is really broken (generally a refresh to self), the capture
# is discarded in the RecursionError above.
default_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(int(default_recursion_limit * 1.1))
try:
pickle.dump(ct, _p)
except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
sys.setrecursionlimit(default_recursion_limit)
finally:
lock_file.unlink(missing_ok=True)
return ct
def _set_capture_cache(self, capture_dir: Path):
'''Populate the redis cache for a capture. Mostly used on the index page.
NOTE: Doesn't require the pickle.'''
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': str(capture_dir)}
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (capture_dir / 'error.txt').open() as _error:
content = _error.read()
try:
error_to_cache = json.loads(content)
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
error_to_cache = error_to_cache.get('details')
except json.decoder.JSONDecodeError:
# old format
error_to_cache = content
cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
if (har_files := sorted(capture_dir.glob('*.har'))):
try:
har = HarFile(har_files[0], uuid)
cache['title'] = har.initial_title
cache['timestamp'] = har.initial_start_time
cache['url'] = har.root_url
if har.initial_redirects and har.need_tree_redirects:
# try to load tree from disk, get redirects
if (ct := load_pickle_tree(capture_dir)):
cache['redirects'] = json.dumps(ct.redirects)
cache['incomplete_redirects'] = 0
else:
# Pickle not available
cache['redirects'] = json.dumps(har.initial_redirects)
cache['incomplete_redirects'] = 1
else:
cache['redirects'] = json.dumps(har.initial_redirects)
cache['incomplete_redirects'] = 0
except Har2TreeError as e:
cache['error'] = str(e)
else:
cache['error'] = f'No har files in {capture_dir.name}'
if (cache.get('error')
and isinstance(cache['error'], str)
and 'HTTP Error' not in cache['error']):
self.logger.warning(cache['error'])
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
if (capture_dir / 'no_index').exists():
# If the folder claims anonymity
cache['no_index'] = 1
if (capture_dir / 'parent').exists():
# The capture was initiated from another one
with (capture_dir / 'parent').open() as f:
cache['parent'] = f.read().strip()
p = self.redis.pipeline()
p.hset('lookup_dirs', uuid, str(capture_dir))
p.hmset(str(capture_dir), cache)
p.execute()
self._captures_index[uuid] = CaptureCache(cache)
def _resolve_dns(self, ct: CrawledTree):
'''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
and stores them in ips.json and cnames.json, in the capture directory.
Updates the nodes of the tree accordingly so the information is available.
'''
def _build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
'''Returns a list of CNAMEs starting from one hostname.
The CNAME resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
and that CNAME entry can have another CNAME entry, and so on multiple times.
This method loops over the hostnames until there are no more CNAMEs.'''
cnames: List[str] = []
to_search = hostname
while True:
if known_cnames.get(to_search) is None:
break
# At this point, known_cnames[to_search] must exist and be a str
cnames.append(known_cnames[to_search]) # type: ignore
to_search = known_cnames[to_search]
return cnames
cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
ips_path = ct.root_hartree.har.path.parent / 'ips.json'
host_cnames: Dict[str, Optional[str]] = {}
if cnames_path.exists():
with cnames_path.open() as f:
host_cnames = json.load(f)
host_ips: Dict[str, List[str]] = {}
if ips_path.exists():
with ips_path.open() as f:
host_ips = json.load(f)
for node in ct.root_hartree.hostname_tree.traverse():
if node.name not in host_cnames or node.name not in host_ips:
# Resolve and cache
try:
response = dns.resolver.resolve(node.name, search=True)
for answer in response.response.answer:
if answer.rdtype == dns.rdatatype.RdataType.CNAME:
host_cnames[str(answer.name).rstrip('.')] = str(answer[0].target).rstrip('.')
else:
host_cnames[str(answer.name).rstrip('.')] = None
if answer.rdtype in [dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA]:
host_ips[str(answer.name).rstrip('.')] = list(set(str(b) for b in answer))
except Exception:
host_cnames[node.name] = None
host_ips[node.name] = []
cnames = _build_cname_chain(host_cnames, node.name)
if cnames:
node.add_feature('cname', cnames)
if cnames[-1] in host_ips:
node.add_feature('resolved_ips', host_ips[cnames[-1]])
elif node.name in host_ips:
node.add_feature('resolved_ips', host_ips[node.name])
with cnames_path.open('w') as f:
json.dump(host_cnames, f)
with ips_path.open('w') as f:
json.dump(host_ips, f)
return ct
return self._captures_index[capture_uuid].capture_dir
def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
@@ -338,8 +105,7 @@ class Lookyloo():
def remove_pickle(self, capture_uuid: str, /) -> None:
'''Remove the pickle from a specific capture.'''
capture_dir = self._get_capture_dir(capture_uuid)
remove_pickle_tree(capture_dir)
self._captures_index.remove_pickle(capture_uuid)
def rebuild_cache(self) -> None:
'''Flush and rebuild the redis cache. Doesn't remove the pickles.
@@ -349,8 +115,7 @@ class Lookyloo():
def rebuild_all(self) -> None:
'''Flush and rebuild the redis cache, and delete all the pickles.
The captures will be rebuilt by the background indexer'''
[remove_pickle_tree(capture_dir) for capture_dir in self.capture_dir.iterdir() if capture_dir.is_dir()] # type: ignore
self.rebuild_cache()
self._captures_index.rebuild_all()
def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
'''Get a URL node from a tree, by UUID'''
@@ -515,11 +280,9 @@ class Lookyloo():
"""Add the capture in the hidden pool (not shown on the front page)
NOTE: it won't remove the correlations until they are rebuilt.
"""
capture_dir = self._get_capture_dir(capture_uuid)
self.redis.hset(str(capture_dir), 'no_index', 1)
(capture_dir / 'no_index').touch()
if capture_uuid in self._captures_index:
self._captures_index[capture_uuid].no_index = True
self.redis.hset(str(self._get_capture_dir(capture_uuid)), 'no_index', 1)
(self._get_capture_dir(capture_uuid) / 'no_index').touch()
self._captures_index.reload_cache(capture_uuid)
@property
def capture_uuids(self) -> List[str]:
@@ -535,31 +298,7 @@ class Lookyloo():
# No captures at all on the instance
return []
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids
if (uuid in self._captures_index
and not self._captures_index[uuid].incomplete_redirects)]
captures_to_get = set(capture_uuids) - set(self._captures_index.keys())
if captures_to_get:
p = self.redis.pipeline()
for directory in self.redis.hmget('lookup_dirs', *captures_to_get):
if not directory:
continue
p.hgetall(directory)
for uuid, c in zip(captures_to_get, p.execute()):
try:
if not c:
c = self.capture_cache(uuid)
if not c:
continue
else:
c = CaptureCache(c)
except LookylooException as e:
self.logger.warning(e)
continue
if hasattr(c, 'timestamp'):
all_cache.append(c)
self._captures_index[c.uuid] = c
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid)]
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
return all_cache
@@ -577,15 +316,8 @@ class Lookyloo():
def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
"""Get the cache from redis."""
if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
return self._captures_index[capture_uuid]
try:
capture_dir = self._get_capture_dir(capture_uuid)
cached = self.redis.hgetall(str(capture_dir))
if not cached or cached.get('incomplete_redirects') == '1':
self._set_capture_cache(capture_dir)
else:
self._captures_index[capture_uuid] = CaptureCache(cached)
return self._captures_index[capture_uuid]
except MissingCaptureDirectory as e:
# The UUID is in the captures but the directory is not on the disk.
self.logger.warning(e)
@@ -600,17 +332,11 @@ class Lookyloo():
except Exception as e:
self.logger.critical(e)
return None
else:
return self._captures_index[capture_uuid]
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
'''Get the generated tree in ETE Toolkit format.
Loads the pickle if it exists, creates it otherwise.'''
capture_dir = self._get_capture_dir(capture_uuid)
ct = load_pickle_tree(capture_dir)
if not ct:
ct = self._cache_capture(capture_uuid)
return ct
return self._captures_index[capture_uuid].tree
def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
'''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
@@ -923,14 +649,6 @@ class Lookyloo():
if not cache:
return {'error': 'UUID missing in cache, try again later.'}
if cache.incomplete_redirects:
ct = self._cache_capture(capture_uuid)
cache = self.capture_cache(capture_uuid)
if not cache:
return {'error': 'UUID missing in cache, try again later.'}
else:
ct = self.get_crawled_tree(capture_uuid)
event = MISPEvent()
event.info = f'Lookyloo Capture ({cache.url})'
lookyloo_link: MISPAttribute = event.add_attribute('link', f'https://{self.public_domain}/tree/{capture_uuid}') # type: ignore
@@ -939,7 +657,7 @@ class Lookyloo():
initial_url = URLObject(cache.url)
initial_url.comment = 'Submitted URL'
self.__misp_add_ips_to_URLObject(initial_url, ct.root_hartree.hostname_tree)
self.__misp_add_ips_to_URLObject(initial_url, cache.tree.root_hartree.hostname_tree)
redirects: List[URLObject] = []
for nb, url in enumerate(cache.redirects):
@@ -947,7 +665,7 @@ class Lookyloo():
continue
obj = URLObject(url)
obj.comment = f'Redirect {nb}'
self.__misp_add_ips_to_URLObject(obj, ct.root_hartree.hostname_tree)
self.__misp_add_ips_to_URLObject(obj, cache.tree.root_hartree.hostname_tree)
redirects.append(obj)
if redirects:
redirects[-1].comment = f'Last redirect ({nb})'
@@ -967,7 +685,7 @@ class Lookyloo():
screenshot: MISPAttribute = event.add_attribute('attachment', 'screenshot_landing_page.png', data=self.get_screenshot(capture_uuid), disable_correlation=True) # type: ignore
try:
fo = FileObject(pseudofile=ct.root_hartree.rendered_node.body, filename=ct.root_hartree.rendered_node.filename)
fo = FileObject(pseudofile=cache.tree.root_hartree.rendered_node.body, filename=cache.tree.root_hartree.rendered_node.filename)
fo.comment = 'Content received for the final redirect (before rendering)'
fo.add_reference(final_redirect, 'loaded-by', 'URL loading that content')
fo.add_reference(screenshot, 'rendered-as', 'Screenshot of the page')