chg: More cleanup to use the new caching system as it should be.

pull/267/head
Raphaël Vinot 2021-09-27 11:36:27 +02:00
parent d05b1edf48
commit 681e136ef4
7 changed files with 54 additions and 91 deletions

lookyloo/capturecache.py

@@ -176,11 +176,11 @@ class CapturesIndex(Mapping):
             to_return = Path(capture_dir)
             if to_return.exists():
                 return to_return
-            self.redis.hdel('lookup_dirs_archived', uuid)
+            # The capture was removed, remove the UUID
+            self.redis.hdel('lookup_dirs_archived', uuid)
+            self.redis.delete(capture_dir)
             self.logger.warning(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
             raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
         raise MissingUUID(f'Unable to find UUID {uuid}.')

     def _create_pickle(self, capture_dir: Path) -> CrawledTree:
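This change makes the archive lookup self-healing: when a UUID in the 'lookup_dirs_archived' hash points at a directory that no longer exists, both the hash entry and the per-directory key are dropped before raising. A minimal sketch of that cleanup, assuming redis-py and the key layout shown in the hunk (the socket path and exception types are illustrative):

from pathlib import Path

from redis import Redis

redis = Redis(unix_socket_path='cache.sock', decode_responses=True)  # socket path is illustrative

def resolve_archived_capture(uuid: str) -> Path:
    capture_dir = redis.hget('lookup_dirs_archived', uuid)
    if capture_dir is None:
        raise KeyError(f'Unable to find UUID {uuid}.')
    to_return = Path(capture_dir)
    if to_return.exists():
        return to_return
    # The directory vanished: drop the UUID from the archive index *and*
    # the per-directory hash so no dangling reference survives.
    redis.hdel('lookup_dirs_archived', uuid)
    redis.delete(capture_dir)
    raise FileNotFoundError(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')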

lookyloo/lookyloo.py

@@ -27,7 +27,7 @@ from werkzeug.useragents import UserAgent
 from .capturecache import CaptureCache, CapturesIndex
 from .context import Context
 from .exceptions import (LookylooException, MissingCaptureDirectory,
-                         MissingUUID, TreeNeedsRebuild)
+                         MissingUUID, TreeNeedsRebuild, NoValidHarFile)
 from .helpers import (CaptureStatus, get_captures_dir, get_config,
                       get_email_template, get_homedir, get_resources_hashes,
                       get_socket_path, get_splash_url, get_taxonomies, uniq_domains)
@@ -85,10 +85,6 @@ class Lookyloo():
     def redis(self):
         return Redis(connection_pool=self.redis_pool)

-    def _get_capture_dir(self, capture_uuid: str, /) -> Path:
-        '''Use the cache to get a capture directory from a capture UUID'''
-        return self._captures_index[capture_uuid].capture_dir
-
     def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
                     legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
         '''Adds context information to a capture or a URL node'''
@@ -142,8 +138,7 @@ class Lookyloo():
     def get_meta(self, capture_uuid: str, /) -> Dict[str, str]:
         '''Get the meta informations from a capture (mostly, details about the User Agent used.)'''
-        capture_dir = self._get_capture_dir(capture_uuid)
-        metafile = capture_dir / 'meta'
+        metafile = self._captures_index[capture_uuid].capture_dir / 'meta'
         if metafile.exists():
             with metafile.open('r') as f:
                 return json.load(f)
@@ -171,10 +166,10 @@ class Lookyloo():
     def categories_capture(self, capture_uuid: str, /) -> Dict[str, Any]:
         '''Get all the categories related to a capture, in MISP Taxonomies format'''
-        capture_dir = self._get_capture_dir(capture_uuid)
+        categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
         # get existing categories if possible
-        if (capture_dir / 'categories').exists():
-            with (capture_dir / 'categories').open() as f:
+        if categ_file.exists():
+            with categ_file.open() as f:
                 current_categories = [line.strip() for line in f.readlines()]
             return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
         return {}
@@ -186,30 +181,30 @@ class Lookyloo():
         # Make sure the category is mappable to a taxonomy.
         self.taxonomies.revert_machinetag(category)
-        capture_dir = self._get_capture_dir(capture_uuid)
+        categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
         # get existing categories if possible
-        if (capture_dir / 'categories').exists():
-            with (capture_dir / 'categories').open() as f:
+        if categ_file.exists():
+            with categ_file.open() as f:
                 current_categories = set(line.strip() for line in f.readlines())
         else:
             current_categories = set()
         current_categories.add(category)
-        with (capture_dir / 'categories').open('w') as f:
+        with categ_file.open('w') as f:
             f.writelines(f'{t}\n' for t in current_categories)

     def uncategorize_capture(self, capture_uuid: str, /, category: str) -> None:
         '''Remove a category (MISP Taxonomy tag) from a capture.'''
         if not get_config('generic', 'enable_categorization'):
             return
-        capture_dir = self._get_capture_dir(capture_uuid)
+        categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
         # get existing categories if possible
-        if (capture_dir / 'categories').exists():
-            with (capture_dir / 'categories').open() as f:
+        if categ_file.exists():
+            with categ_file.open() as f:
                 current_categories = set(line.strip() for line in f.readlines())
         else:
             current_categories = set()
         current_categories.remove(category)
-        with (capture_dir / 'categories').open('w') as f:
+        with categ_file.open('w') as f:
             f.writelines(f'{t}\n' for t in current_categories)

     def trigger_modules(self, capture_uuid: str, /, force: bool=False, auto_trigger: bool=False) -> Dict:
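Both methods, and categories_capture above, now share one shape: derive categ_file once from the cached capture directory, then read-modify-write a plain text file with one machinetag per line. A condensed sketch of that shared pattern, assuming a pathlib.Path to an existing capture directory (the helper name is hypothetical):

from pathlib import Path
from typing import Set

def toggle_category(capture_dir: Path, category: str, *, add: bool) -> None:
    categ_file = capture_dir / 'categories'
    # Load existing categories if the file exists, one machinetag per line.
    current: Set[str] = set()
    if categ_file.exists():
        with categ_file.open() as f:
            current = {line.strip() for line in f}
    if add:
        current.add(category)
    else:
        current.discard(category)  # discard() is a no-op for absent tags
    with categ_file.open('w') as f:
        f.writelines(f'{t}\n' for t in current)

Unlike the sketch, uncategorize_capture as written uses set.remove() and so raises KeyError when asked to drop a tag the capture never had.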
@@ -283,23 +278,19 @@ class Lookyloo():
         """Add the capture in the hidden pool (not shown on the front page)
         NOTE: it won't remove the correlations until they are rebuilt.
         """
-        self.redis.hset(str(self._get_capture_dir(capture_uuid)), 'no_index', 1)
-        (self._get_capture_dir(capture_uuid) / 'no_index').touch()
+        capture_dir = self._captures_index[capture_uuid].capture_dir
+        self.redis.hset(str(capture_dir), 'no_index', 1)
+        (capture_dir / 'no_index').touch()
         self._captures_index.reload_cache(capture_uuid)

     def update_tree_cache_info(self, process_id: int, classname: str) -> None:
         self.redis.hset('tree_cache', f'{process_id}|{classname}', str(self._captures_index.lru_cache_status()))

-    @property
-    def capture_uuids(self) -> List[str]:
-        '''All the capture UUIDs present in the cache.'''
-        return self.redis.hkeys('lookup_dirs')
-
     def sorted_capture_cache(self, capture_uuids: Optional[Iterable[str]]=None) -> List[CaptureCache]:
         '''Get all the captures in the cache, sorted by timestamp (new -> old).'''
         if capture_uuids is None:
-            # Sort all captures
-            capture_uuids = self.capture_uuids
+            # Sort all recent captures
+            capture_uuids = self.redis.hkeys('lookup_dirs')
         if not capture_uuids:
             # No captures at all on the instance
             return []
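With the capture_uuids property gone, sorted_capture_cache() is the single entry point for enumerating captures; by default it reads the UUIDs straight from the 'lookup_dirs' Redis hash. A minimal sketch of the resulting flow, assuming a dict-like captures index whose entries carry a datetime timestamp (the real method also skips entries that fail to load):

from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Iterable, List, Optional

@dataclass
class CaptureCache:  # stand-in for lookyloo's CaptureCache
    uuid: str
    timestamp: datetime

def sorted_capture_cache(index: Dict[str, CaptureCache], redis,
                         capture_uuids: Optional[Iterable[str]] = None) -> List[CaptureCache]:
    if capture_uuids is None:
        # Default to every recent capture known to the Redis index.
        capture_uuids = redis.hkeys('lookup_dirs')
    if not capture_uuids:
        # No captures at all on the instance.
        return []
    all_cache = [index[uuid] for uuid in capture_uuids]
    all_cache.sort(key=lambda c: c.timestamp, reverse=True)  # new -> old
    return all_cache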
@@ -309,6 +300,7 @@ class Lookyloo():
         return all_cache

     def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus:
+        '''Returns the status (queued, ongoing, done, or UUID unknown)'''
         if self.redis.zrank('to_capture', capture_uuid) is not None:
             return CaptureStatus.QUEUED
         elif self.redis.hexists('lookup_dirs', capture_uuid):
@@ -318,6 +310,7 @@ class Lookyloo():
         return CaptureStatus.UNKNOWN

     def try_error_status(self, capture_uuid: str, /) -> Optional[str]:
+        '''If it is not possible to do the capture, we store the error for a short amount of time'''
         return self.redis.get(f'error_{capture_uuid}')

     def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
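These two hunks only add docstrings, but they document the resolution order: a UUID is queued if it sits in the 'to_capture' sorted set, done if it appears in the 'lookup_dirs' hash, unknown otherwise. A sketch of that walk, assuming redis-py; the enum values are stand-ins, and the branch for ongoing captures falls between the two hunks and is not shown:

from enum import IntEnum

from redis import Redis

class CaptureStatus(IntEnum):  # stand-in; the real enum lives in lookyloo's helpers
    UNKNOWN = -1
    QUEUED = 0
    DONE = 1
    ONGOING = 2

def get_capture_status(redis: Redis, capture_uuid: str) -> CaptureStatus:
    if redis.zrank('to_capture', capture_uuid) is not None:
        return CaptureStatus.QUEUED
    if redis.hexists('lookup_dirs', capture_uuid):
        return CaptureStatus.DONE
    # The ongoing-capture check is elided in the hunk above.
    return CaptureStatus.UNKNOWN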
@@ -351,7 +344,7 @@ class Lookyloo():
     def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
         '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''

-        def _get_priority(source: str, user: str, authenticated: bool) -> int:
+        def get_priority(source: str, user: str, authenticated: bool) -> int:
             src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1
             if not authenticated:
                 usr_prio = self._priority['users']['_default_anon']
@@ -364,7 +357,7 @@ class Lookyloo():
             usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth']
             return src_prio + usr_prio

-        priority = _get_priority(source, user, authenticated)
+        priority = get_priority(source, user, authenticated)
         perma_uuid = str(uuid4())
         p = self.redis.pipeline()
         for key, value in query.items():
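The rename just drops the leading underscore, which adds nothing for a helper already scoped inside the method. The priority itself is the sum of a per-source and a per-user score. A standalone sketch of that computation, assuming a _priority mapping shaped like the keys used above:

from typing import Any, Dict

def get_priority(priority_conf: Dict[str, Dict[str, Any]],
                 source: str, user: str, authenticated: bool) -> int:
    # Unknown sources score -1, so they sink below configured ones.
    src_prio: int = priority_conf['sources'].get(source, -1)
    if not authenticated:
        usr_prio = priority_conf['users']['_default_anon']
    else:
        # Fall back to the authenticated default when the user is not listed.
        usr_prio = priority_conf['users'].get(user) or priority_conf['users']['_default_auth']
    return src_prio + usr_prio

# Example: a configured source plus an anonymous user.
conf = {'sources': {'web': 10}, 'users': {'_default_anon': -10, '_default_auth': 0}}
assert get_priority(conf, 'web', 'nobody', authenticated=False) == 0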
@@ -427,7 +420,7 @@ class Lookyloo():
     def _get_raw(self, capture_uuid: str, /, extension: str='*', all_files: bool=True) -> BytesIO:
         '''Get file(s) from the capture directory'''
         try:
-            capture_dir = self._get_capture_dir(capture_uuid)
+            capture_dir = self._captures_index[capture_uuid].capture_dir
         except MissingUUID:
             return BytesIO(f'Capture {capture_uuid} not unavailable, try again later.'.encode())
         except MissingCaptureDirectory:
@@ -607,6 +600,10 @@ class Lookyloo():
         except IndexError:
             # unable to find the uuid, the cache is probably in a weird state.
             return None
+        except NoValidHarFile as e:
+            # something went poorly when rebuilding the tree (probably a recursive error)
+            self.logger.warning(e)
+            return None
         if url.empty_response:
             return None
         if not h or h == url.body_hash:
@@ -800,7 +797,7 @@ class Lookyloo():
     def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
         '''Gather all the informations needed to display the Hostnode investigator popup.'''

-        def _normalize_known_content(h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
+        def normalize_known_content(h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
             ''' There are a few different sources to figure out known vs. legitimate content,
             this method normalize it for the web interface.'''
             known: Optional[Union[str, List[Any]]] = None
@@ -861,13 +858,13 @@ class Lookyloo():
                 if freq_embedded['hash_freq'] > 1:
                     to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid)
             for h in to_append['embedded_ressources'].keys():
-                known, legitimate = _normalize_known_content(h, known_content, url)
+                known, legitimate = normalize_known_content(h, known_content, url)
                 if known:
                     to_append['embedded_ressources'][h]['known_content'] = known
                 elif legitimate:
                     to_append['embedded_ressources'][h]['legitimacy'] = legitimate

-            known, legitimate = _normalize_known_content(url.body_hash, known_content, url)
+            known, legitimate = normalize_known_content(url.body_hash, known_content, url)
             if known:
                 to_append['known_content'] = known
             elif legitimate:

poetry.lock (generated)

@@ -277,7 +277,7 @@ python-versions = "*"

 [[package]]
 name = "filetype"
-version = "1.0.7"
+version = "1.0.8"
 description = "Infer file type and MIME type of any file/buffer. No external dependencies."
 category = "main"
 optional = false
@@ -437,7 +437,7 @@ scripts = ["click (>=6.0)", "twisted (>=16.4.0)"]

 [[package]]
 name = "ipython"
-version = "7.27.0"
+version = "7.28.0"
 description = "IPython: Productive Interactive Computing"
 category = "dev"
 optional = false
@@ -976,7 +976,7 @@ use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]

 [[package]]
 name = "rich"
-version = "10.10.0"
+version = "10.11.0"
 description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
 category = "main"
 optional = false
@@ -1199,7 +1199,7 @@ python-versions = "*"

 [[package]]
 name = "types-requests"
-version = "2.25.8"
+version = "2.25.9"
 description = "Typing stubs for requests"
 category = "dev"
 optional = false
@@ -1223,7 +1223,7 @@ python-versions = "*"

 [[package]]
 name = "urllib3"
-version = "1.26.6"
+version = "1.26.7"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 category = "main"
 optional = false
@@ -1317,7 +1317,7 @@ misp = ["python-magic", "pydeep"]

 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "bd09b20f35c4a361a109abcf950326519706a5938c436e9ac6f2e07ef0e782c2"
+content-hash = "2b182690d3f0bb7438ded19e043d0c6aab7870e5460f4e4942a56c626d8fed66"

 [metadata.files]
 aiohttp = [
@@ -1533,8 +1533,8 @@ ete3 = [
     {file = "ete3-3.1.2.tar.gz", hash = "sha256:4fc987b8c529889d6608fab1101f1455cb5cbd42722788de6aea9c7d0a8e59e9"},
 ]
 filetype = [
-    {file = "filetype-1.0.7-py2.py3-none-any.whl", hash = "sha256:353369948bb1c09b8b3ea3d78390b5586e9399bff9aab894a1dff954e31a66f6"},
-    {file = "filetype-1.0.7.tar.gz", hash = "sha256:da393ece8d98b47edf2dd5a85a2c8733e44b769e32c71af4cd96ed8d38d96aa7"},
+    {file = "filetype-1.0.8-py2.py3-none-any.whl", hash = "sha256:eb974519c5dbbd678a9fbfb5e2616105c6768ee9c01ce4a4ecaefc141e50d5e5"},
+    {file = "filetype-1.0.8.tar.gz", hash = "sha256:77df14879b73fd9711b8bd4f465dadf2ecdafff0eac3b22c0bdb0ccba68db316"},
 ]
 flask = [
     {file = "Flask-2.0.1-py3-none-any.whl", hash = "sha256:a6209ca15eb63fc9385f38e452704113d679511d9574d09b2cf9183ae7d20dc9"},
@@ -1623,8 +1623,8 @@ incremental = [
     {file = "incremental-21.3.0.tar.gz", hash = "sha256:02f5de5aff48f6b9f665d99d48bfc7ec03b6e3943210de7cfc88856d755d6f57"},
 ]
 ipython = [
-    {file = "ipython-7.27.0-py3-none-any.whl", hash = "sha256:75b5e060a3417cf64f138e0bb78e58512742c57dc29db5a5058a2b1f0c10df02"},
-    {file = "ipython-7.27.0.tar.gz", hash = "sha256:58b55ebfdfa260dad10d509702dc2857cb25ad82609506b070cf2d7b7df5af13"},
+    {file = "ipython-7.28.0-py3-none-any.whl", hash = "sha256:f16148f9163e1e526f1008d7c8d966d9c15600ca20d1a754287cf96d00ba6f1d"},
+    {file = "ipython-7.28.0.tar.gz", hash = "sha256:2097be5c814d1b974aea57673176a924c4c8c9583890e7a5f082f547b9975b11"},
 ]
 itemadapter = [
     {file = "itemadapter-0.4.0-py3-none-any.whl", hash = "sha256:695809a4e2f42174f0392dd66c2ceb2b2454d3ebbf65a930e5c85910d8d88d8f"},
@@ -2061,8 +2061,8 @@ requests = [
     {file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"},
 ]
 rich = [
-    {file = "rich-10.10.0-py3-none-any.whl", hash = "sha256:0b8cbcb0b8d476a7f002feaed9f35e51615f673c6c291d76ddf0c555574fd3c7"},
-    {file = "rich-10.10.0.tar.gz", hash = "sha256:bacf58b25fea6b920446fe4e7abdc6c7664c4530c4098e7a1bc79b16b8551dfa"},
+    {file = "rich-10.11.0-py3-none-any.whl", hash = "sha256:44bb3f9553d00b3c8938abf89828df870322b9ba43caf3b12bb7758debdc6dec"},
+    {file = "rich-10.11.0.tar.gz", hash = "sha256:016fa105f34b69c434e7f908bb5bd7fefa9616efdb218a2917117683a6394ce5"},
 ]
 scrapy = [
     {file = "Scrapy-2.5.0-py2.py3-none-any.whl", hash = "sha256:5f590fdc84b496e5a4bb5ef99836b0aa688a07cfcb4bc3bb7290f66486f27424"},
@@ -2139,8 +2139,8 @@ types-redis = [
     {file = "types_redis-3.5.8-py3-none-any.whl", hash = "sha256:85814769071721044857c34841e46064b867ccdd58fc81221c43462bd07e4892"},
 ]
 types-requests = [
-    {file = "types-requests-2.25.8.tar.gz", hash = "sha256:225ac2e86549b6ef3a8a44bf955f80b4955855704a15d2883d8445c8df637242"},
-    {file = "types_requests-2.25.8-py3-none-any.whl", hash = "sha256:26e90866bcd773d76b316de7e6bd6e24641f9e1653cf27241c533886600f6824"},
+    {file = "types-requests-2.25.9.tar.gz", hash = "sha256:4ec8b71da73e5344adb9bee725a74ec8598e7286f9bcb17500d627f259fe4fb9"},
+    {file = "types_requests-2.25.9-py3-none-any.whl", hash = "sha256:543ba8b3b23e38ac028da1d163aecbbc27d3cc8f654ae64339da539a191a2b1c"},
 ]
 types-werkzeug = [
     {file = "types-Werkzeug-1.0.5.tar.gz", hash = "sha256:f6216ab0e0211fe73ebdb4ae0e414113d4d8a2f783a15c2d8550e06d0fd8e7f9"},
@@ -2152,8 +2152,8 @@ typing-extensions = [
     {file = "typing_extensions-3.10.0.2.tar.gz", hash = "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e"},
 ]
 urllib3 = [
-    {file = "urllib3-1.26.6-py2.py3-none-any.whl", hash = "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4"},
-    {file = "urllib3-1.26.6.tar.gz", hash = "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"},
+    {file = "urllib3-1.26.7-py2.py3-none-any.whl", hash = "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"},
+    {file = "urllib3-1.26.7.tar.gz", hash = "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece"},
 ]
 vt-py = [
     {file = "vt-py-0.7.4.tar.gz", hash = "sha256:27af411495a115f6cc2da5d184a9f32c12a2304eca94efefaae03d2b12b66174"},

pyproject.toml

@@ -61,7 +61,7 @@ lief = "^0.11.4"
 Flask-Login = "^0.5.0"
 flask-restx = "^0.5.1"
 hiredis = "^2.0.0"
-rich = "^10.10.0"
+rich = "^10.11.0"
 pyphishtanklookup = "^1.0.1"

 [tool.poetry.extras]
@@ -69,9 +69,9 @@ misp = ['python-magic', 'pydeep']

 [tool.poetry.dev-dependencies]
 mypy = "^0.910"
-ipython = "^7.27.0"
+ipython = "^7.28.0"
 types-redis = "^3.5.8"
-types-requests = "^2.25.8"
+types-requests = "^2.25.9"
 types-Flask = "^1.1.3"
 types-pkg-resources = "^0.1.2"


@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-from lookyloo.lookyloo import Lookyloo
-
-lookyloo = Lookyloo()
-
-for capture_uuid in lookyloo.capture_uuids:
-    try:
-        ct = lookyloo.get_crawled_tree(capture_uuid)
-    except Exception:
-        continue


@@ -23,28 +23,9 @@ def main():
     indexing = Indexing()
     indexing.clear_indexes()
-    for capture_uuid in lookyloo.capture_uuids:
-        index = True
-        try:
-            tree = lookyloo.get_crawled_tree(capture_uuid)
-        except Exception as e:
-            print(capture_uuid, e)
-            continue
-
-        if lookyloo.is_public_instance:
-            cache = lookyloo.capture_cache(capture_uuid)
-            if not cache:
-                continue
-            if cache.no_index:
-                index = False
-
-        # NOTE: these methods do nothing if we just generated the pickle when calling lookyloo.get_crawled_tree
-        if index:
-            indexing.index_cookies_capture(tree)
-            indexing.index_body_hashes_capture(tree)
-            indexing.index_url_capture(tree)
-            categories = list(lookyloo.categories_capture(capture_uuid).keys())
-            indexing.index_categories_capture(capture_uuid, categories)
+    # This call will rebuild all the caches as needed.
+    lookyloo.sorted_capture_cache()

 if __name__ == '__main__':
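The removed NOTE explains why the whole loop could go: the indexing methods are already no-ops right after a pickle is generated, meaning index maintenance happens while the cache for a capture is built. A hypothetical minimal version of the rebuilt script under the new system (the import path for Indexing is an assumption):

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lookyloo.lookyloo import Lookyloo
from lookyloo.indexing import Indexing  # Indexing's home module is an assumption

def main() -> None:
    lookyloo = Lookyloo()
    indexing = Indexing()
    indexing.clear_indexes()
    # Iterating the sorted cache rebuilds every capture's cache as needed.
    lookyloo.sorted_capture_cache()

if __name__ == '__main__':
    main()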


@@ -23,10 +23,7 @@ def uniq_domains(uniq_urls):
         return domains

-for uuid in lookyloo.capture_uuids:
-    cache = lookyloo.capture_cache(uuid)
-    if not cache or not hasattr(cache, 'timestamp'):
-        continue
+for cache in lookyloo.sorted_capture_cache():
     date = cache.timestamp
     if date.year not in stats:
         stats[date.year] = {}
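The stats script no longer needs the hasattr guard: sorted_capture_cache() only hands back valid cache entries, each carrying a timestamp. A sketch of where the truncated loop is heading, assuming per-month buckets; the fields actually aggregated per month are not shown in the hunk:

from datetime import datetime
from typing import Any, Dict, List

stats: Dict[int, Dict[int, List[Any]]] = {}

def bucket_by_month(caches) -> Dict[int, Dict[int, List[Any]]]:
    for cache in caches:
        date: datetime = cache.timestamp
        if date.year not in stats:
            stats[date.year] = {}
        # One list per month with the captures seen that month.
        stats[date.year].setdefault(date.month, []).append(cache)
    return stats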