mirror of https://github.com/CIRCL/lookyloo
chg: Improve tree creation and cache
parent
01fff00cad
commit
32ee474be2
|
@ -91,6 +91,8 @@ class BackgroundIndexer(AbstractManager):
|
|||
if not indexed[2]:
|
||||
self.logger.info(f'Indexing cookies for {cache.uuid}')
|
||||
self.lookyloo.indexing.index_cookies_capture(ct)
|
||||
# NOTE: categories aren't taken in account here, should be fixed(?)
|
||||
# see indexing.index_categories_capture(capture_uuid, categories)
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
@ -2,11 +2,26 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
import logging
|
||||
import pickle
|
||||
import sys
|
||||
import time
|
||||
|
||||
from .exceptions import LookylooException, MissingCaptureDirectory
|
||||
from collections.abc import Mapping
|
||||
from datetime import datetime
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import dns.rdatatype
|
||||
import dns.resolver
|
||||
from har2tree import CrawledTree, Har2TreeError, HarFile
|
||||
from redis import Redis
|
||||
|
||||
from .context import Context
|
||||
from .exceptions import (LookylooException, MissingCaptureDirectory, NoValidHarFile,
|
||||
MissingUUID, TreeNeedsRebuild)
|
||||
from .helpers import try_make_file, get_config
|
||||
|
||||
|
||||
class CaptureCache():
|
||||
|
@ -35,3 +50,287 @@ class CaptureCache():
|
|||
self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
|
||||
self.categories: List[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else []
|
||||
self.parent: Optional[str] = cache_entry.get('parent')
|
||||
|
||||
@property
|
||||
def tree(self) -> CrawledTree:
|
||||
try:
|
||||
return load_pickle_tree(self.capture_dir)
|
||||
except TreeNeedsRebuild:
|
||||
# The pickle is missing, that shouldn't at this stage.
|
||||
raise LookylooException(f'Unable to get pickle for {self.uuid} - {self.capture_dir}')
|
||||
|
||||
|
||||
def remove_pickle_tree(capture_dir: Path) -> None:
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
if pickle_file.exists():
|
||||
pickle_file.unlink()
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def load_pickle_tree(capture_dir: Path) -> CrawledTree:
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
if pickle_file.exists():
|
||||
with pickle_file.open('rb') as _p:
|
||||
try:
|
||||
return pickle.load(_p)
|
||||
except pickle.UnpicklingError:
|
||||
remove_pickle_tree(capture_dir)
|
||||
except EOFError:
|
||||
remove_pickle_tree(capture_dir)
|
||||
except Exception:
|
||||
remove_pickle_tree(capture_dir)
|
||||
raise TreeNeedsRebuild()
|
||||
|
||||
|
||||
class CapturesIndex(Mapping):
|
||||
|
||||
def __init__(self, redis: Redis, contextualizer: Optional[Context]=None):
|
||||
self.logger = logging.getLogger(f'{self.__class__.__name__}')
|
||||
self.logger.setLevel(get_config('generic', 'loglevel'))
|
||||
self.redis = redis
|
||||
self.contextualizer = contextualizer
|
||||
self.__cache: Dict[str, CaptureCache] = {}
|
||||
|
||||
def __getitem__(self, uuid: str) -> CaptureCache:
|
||||
if uuid in self.__cache:
|
||||
if (self.__cache[uuid].capture_dir.exists()
|
||||
and not self.__cache[uuid].incomplete_redirects):
|
||||
return self.__cache[uuid]
|
||||
del self.__cache[uuid]
|
||||
capture_dir = self._get_capture_dir(uuid)
|
||||
cached = self.redis.hgetall(str(capture_dir))
|
||||
if cached:
|
||||
cc = CaptureCache(cached)
|
||||
# NOTE: checking for pickle to exist may be a bad idea here.
|
||||
if (cc.capture_dir.exists()
|
||||
and (cc.capture_dir / 'tree.pickle').exists()
|
||||
and not cc.incomplete_redirects):
|
||||
self.__cache[uuid] = cc
|
||||
return self.__cache[uuid]
|
||||
try:
|
||||
tree = load_pickle_tree(capture_dir)
|
||||
except TreeNeedsRebuild:
|
||||
tree = self._create_pickle(capture_dir)
|
||||
self.__cache[uuid] = self._set_capture_cache(capture_dir, tree)
|
||||
return self.__cache[uuid]
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.__cache)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.__cache)
|
||||
|
||||
def reload_cache(self, uuid: str) -> None:
|
||||
if uuid in self.__cache:
|
||||
del self.__cache[uuid]
|
||||
|
||||
def remove_pickle(self, uuid: str) -> None:
|
||||
if uuid in self.__cache:
|
||||
remove_pickle_tree(self.__cache[uuid].capture_dir)
|
||||
del self.__cache[uuid]
|
||||
|
||||
def rebuild_all(self) -> None:
|
||||
for uuid, cache in self.__cache.items():
|
||||
remove_pickle_tree(cache.capture_dir)
|
||||
self.redis.flushdb()
|
||||
self.__cache = {}
|
||||
|
||||
def _get_capture_dir(self, uuid: str) -> Path:
|
||||
# Try to get from the recent captures cache in redis
|
||||
capture_dir = self.redis.hget('lookup_dirs', uuid)
|
||||
if capture_dir:
|
||||
to_return = Path(capture_dir)
|
||||
if to_return.exists():
|
||||
return to_return
|
||||
# The capture was either removed or archived, cleaning up
|
||||
self.redis.hdel('lookup_dirs', uuid)
|
||||
self.redis.delete(capture_dir)
|
||||
|
||||
# Try to get from the archived captures cache in redis
|
||||
capture_dir = self.redis.hget('lookup_dirs_archived', uuid)
|
||||
if capture_dir:
|
||||
to_return = Path(capture_dir)
|
||||
if to_return.exists():
|
||||
return to_return
|
||||
self.redis.hdel('lookup_dirs_archived', uuid)
|
||||
# The capture was removed, remove the UUID
|
||||
self.logger.warning(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
|
||||
raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
|
||||
|
||||
raise MissingUUID(f'Unable to find UUID {uuid}.')
|
||||
|
||||
def _create_pickle(self, capture_dir: Path) -> CrawledTree:
|
||||
with (capture_dir / 'uuid').open() as f:
|
||||
uuid = f.read().strip()
|
||||
|
||||
lock_file = capture_dir / 'lock'
|
||||
if try_make_file(lock_file):
|
||||
# Lock created, we can process
|
||||
with lock_file.open('w') as f:
|
||||
f.write(datetime.now().isoformat())
|
||||
else:
|
||||
# The pickle is being created somewhere else, wait until it's done.
|
||||
while lock_file.exists():
|
||||
time.sleep(5)
|
||||
return load_pickle_tree(capture_dir)
|
||||
|
||||
har_files = sorted(capture_dir.glob('*.har'))
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
try:
|
||||
tree = CrawledTree(har_files, uuid)
|
||||
self.__resolve_dns(tree)
|
||||
if self.contextualizer:
|
||||
self.contextualizer.contextualize_tree(tree)
|
||||
except Har2TreeError as e:
|
||||
raise NoValidHarFile(e)
|
||||
except RecursionError as e:
|
||||
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
|
||||
else:
|
||||
with pickle_file.open('wb') as _p:
|
||||
# Some pickles require a pretty high recursion limit, this kindof fixes it.
|
||||
# If the capture is really broken (generally a refresh to self), the capture
|
||||
# is discarded in the RecursionError above.
|
||||
default_recursion_limit = sys.getrecursionlimit()
|
||||
sys.setrecursionlimit(int(default_recursion_limit * 1.1))
|
||||
try:
|
||||
pickle.dump(tree, _p)
|
||||
except RecursionError as e:
|
||||
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
|
||||
sys.setrecursionlimit(default_recursion_limit)
|
||||
finally:
|
||||
lock_file.unlink(missing_ok=True)
|
||||
return tree
|
||||
|
||||
def _set_capture_cache(self, capture_dir: Path, tree: Optional[CrawledTree]=None) -> CaptureCache:
|
||||
'''Populate the redis cache for a capture. Mostly used on the index page.
|
||||
NOTE: Doesn't require the pickle.'''
|
||||
with (capture_dir / 'uuid').open() as f:
|
||||
uuid = f.read().strip()
|
||||
|
||||
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': str(capture_dir)}
|
||||
if (capture_dir / 'error.txt').exists():
|
||||
# Something went wrong
|
||||
with (capture_dir / 'error.txt').open() as _error:
|
||||
content = _error.read()
|
||||
try:
|
||||
error_to_cache = json.loads(content)
|
||||
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
|
||||
error_to_cache = error_to_cache.get('details')
|
||||
except json.decoder.JSONDecodeError:
|
||||
# old format
|
||||
error_to_cache = content
|
||||
cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
|
||||
|
||||
if (har_files := sorted(capture_dir.glob('*.har'))):
|
||||
try:
|
||||
har = HarFile(har_files[0], uuid)
|
||||
cache['title'] = har.initial_title
|
||||
cache['timestamp'] = har.initial_start_time
|
||||
cache['url'] = har.root_url
|
||||
if har.initial_redirects and har.need_tree_redirects:
|
||||
if not tree:
|
||||
# try to load tree from disk
|
||||
tree = load_pickle_tree(capture_dir)
|
||||
# get redirects
|
||||
if tree:
|
||||
cache['redirects'] = json.dumps(tree.redirects)
|
||||
cache['incomplete_redirects'] = 0
|
||||
else:
|
||||
# Pickle not available
|
||||
cache['redirects'] = json.dumps(har.initial_redirects)
|
||||
cache['incomplete_redirects'] = 1
|
||||
else:
|
||||
cache['redirects'] = json.dumps(har.initial_redirects)
|
||||
cache['incomplete_redirects'] = 0
|
||||
|
||||
except Har2TreeError as e:
|
||||
cache['error'] = str(e)
|
||||
else:
|
||||
cache['error'] = f'No har files in {capture_dir.name}'
|
||||
|
||||
if (cache.get('error')
|
||||
and isinstance(cache['error'], str)
|
||||
and 'HTTP Error' not in cache['error']):
|
||||
self.logger.warning(cache['error'])
|
||||
|
||||
if (capture_dir / 'categories').exists():
|
||||
with (capture_dir / 'categories').open() as _categories:
|
||||
cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
|
||||
|
||||
if (capture_dir / 'no_index').exists():
|
||||
# If the folders claims anonymity
|
||||
cache['no_index'] = 1
|
||||
|
||||
if (capture_dir / 'parent').exists():
|
||||
# The capture was initiated from an other one
|
||||
with (capture_dir / 'parent').open() as f:
|
||||
cache['parent'] = f.read().strip()
|
||||
|
||||
p = self.redis.pipeline()
|
||||
p.hset('lookup_dirs', uuid, str(capture_dir))
|
||||
p.hmset(str(capture_dir), cache) # type: ignore
|
||||
p.execute()
|
||||
return CaptureCache(cache)
|
||||
|
||||
def __resolve_dns(self, ct: CrawledTree):
|
||||
'''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
|
||||
and store them in ips.json and cnames.json, in the capture directory.
|
||||
Updates the nodes of the tree accordingly so the information is available.
|
||||
'''
|
||||
|
||||
def _build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
|
||||
'''Returns a list of CNAMEs starting from one hostname.
|
||||
The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
|
||||
and the CNAME entry can have an other CNAME entry, and so on multiple times.
|
||||
This method loops over the hostnames until there are no CNAMES.'''
|
||||
cnames: List[str] = []
|
||||
to_search = hostname
|
||||
while True:
|
||||
if known_cnames.get(to_search) is None:
|
||||
break
|
||||
# At this point, known_cnames[to_search] must exist and be a str
|
||||
cnames.append(known_cnames[to_search]) # type: ignore
|
||||
to_search = known_cnames[to_search]
|
||||
return cnames
|
||||
|
||||
cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
|
||||
ips_path = ct.root_hartree.har.path.parent / 'ips.json'
|
||||
host_cnames: Dict[str, Optional[str]] = {}
|
||||
if cnames_path.exists():
|
||||
with cnames_path.open() as f:
|
||||
host_cnames = json.load(f)
|
||||
|
||||
host_ips: Dict[str, List[str]] = {}
|
||||
if ips_path.exists():
|
||||
with ips_path.open() as f:
|
||||
host_ips = json.load(f)
|
||||
|
||||
for node in ct.root_hartree.hostname_tree.traverse():
|
||||
if node.name not in host_cnames or node.name not in host_ips:
|
||||
# Resolve and cache
|
||||
try:
|
||||
response = dns.resolver.resolve(node.name, search=True)
|
||||
for answer in response.response.answer:
|
||||
if answer.rdtype == dns.rdatatype.RdataType.CNAME:
|
||||
host_cnames[str(answer.name).rstrip('.')] = str(answer[0].target).rstrip('.')
|
||||
else:
|
||||
host_cnames[str(answer.name).rstrip('.')] = None
|
||||
|
||||
if answer.rdtype in [dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA]:
|
||||
host_ips[str(answer.name).rstrip('.')] = list(set(str(b) for b in answer))
|
||||
except Exception:
|
||||
host_cnames[node.name] = None
|
||||
host_ips[node.name] = []
|
||||
cnames = _build_cname_chain(host_cnames, node.name)
|
||||
if cnames:
|
||||
node.add_feature('cname', cnames)
|
||||
if cnames[-1] in host_ips:
|
||||
node.add_feature('resolved_ips', host_ips[cnames[-1]])
|
||||
elif node.name in host_ips:
|
||||
node.add_feature('resolved_ips', host_ips[node.name])
|
||||
|
||||
with cnames_path.open('w') as f:
|
||||
json.dump(host_cnames, f)
|
||||
with ips_path.open('w') as f:
|
||||
json.dump(host_ips, f)
|
||||
return ct
|
||||
|
|
|
@ -28,3 +28,7 @@ class MissingUUID(LookylooException):
|
|||
|
||||
class MissingCaptureDirectory(LookylooException):
|
||||
pass
|
||||
|
||||
|
||||
class TreeNeedsRebuild(LookylooException):
|
||||
pass
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
from datetime import datetime, timedelta
|
||||
from enum import IntEnum, unique
|
||||
from functools import lru_cache
|
||||
|
@ -216,28 +215,6 @@ def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str]]=None) -
|
|||
return to_return
|
||||
|
||||
|
||||
def load_pickle_tree(capture_dir: Path) -> Optional[CrawledTree]:
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
if pickle_file.exists():
|
||||
with pickle_file.open('rb') as _p:
|
||||
try:
|
||||
return pickle.load(_p)
|
||||
except pickle.UnpicklingError:
|
||||
remove_pickle_tree(capture_dir)
|
||||
except EOFError:
|
||||
remove_pickle_tree(capture_dir)
|
||||
except Exception:
|
||||
remove_pickle_tree(capture_dir)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def remove_pickle_tree(capture_dir: Path) -> None:
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
if pickle_file.exists():
|
||||
pickle_file.unlink()
|
||||
|
||||
|
||||
def uniq_domains(uniq_urls):
|
||||
domains = set()
|
||||
for url in uniq_urls:
|
||||
|
|
|
@ -5,10 +5,7 @@ import base64
|
|||
import json
|
||||
import logging
|
||||
import operator
|
||||
import pickle
|
||||
import smtplib
|
||||
import sys
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from datetime import date, datetime
|
||||
from email.message import EmailMessage
|
||||
|
@ -19,9 +16,7 @@ from typing import (Any, Dict, Iterable, List, MutableMapping, Optional, Set,
|
|||
from uuid import uuid4
|
||||
from zipfile import ZipFile
|
||||
|
||||
import dns.rdatatype
|
||||
import dns.resolver
|
||||
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
|
||||
from har2tree import CrawledTree, Har2TreeError, HostNode, URLNode
|
||||
from PIL import Image # type: ignore
|
||||
from pymisp import MISPAttribute, MISPEvent, MISPObject
|
||||
from pymisp.tools import FileObject, URLObject
|
||||
|
@ -29,15 +24,13 @@ from redis import ConnectionPool, Redis
|
|||
from redis.connection import UnixDomainSocketConnection
|
||||
from werkzeug.useragents import UserAgent
|
||||
|
||||
from .capturecache import CaptureCache
|
||||
from .capturecache import CaptureCache, CapturesIndex
|
||||
from .context import Context
|
||||
from .exceptions import (LookylooException, MissingCaptureDirectory,
|
||||
MissingUUID, NoValidHarFile)
|
||||
MissingUUID)
|
||||
from .helpers import (CaptureStatus, get_captures_dir, get_config,
|
||||
get_email_template, get_homedir, get_resources_hashes,
|
||||
get_socket_path, get_splash_url, get_taxonomies,
|
||||
load_pickle_tree, remove_pickle_tree, try_make_file,
|
||||
uniq_domains)
|
||||
get_socket_path, get_splash_url, get_taxonomies, uniq_domains)
|
||||
from .indexing import Indexing
|
||||
from .modules import (MISP, PhishingInitiative, UniversalWhois,
|
||||
UrlScan, VirusTotal, Phishtank)
|
||||
|
@ -86,7 +79,7 @@ class Lookyloo():
|
|||
self.logger.warning('Unable to setup the Phishtank module')
|
||||
|
||||
self.context = Context()
|
||||
self._captures_index: Dict[str, CaptureCache] = {}
|
||||
self._captures_index = CapturesIndex(self.redis, self.context)
|
||||
|
||||
@property
|
||||
def redis(self):
|
||||
|
@ -94,233 +87,7 @@ class Lookyloo():
|
|||
|
||||
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
|
||||
'''Use the cache to get a capture directory from a capture UUID'''
|
||||
capture_dir: Optional[str]
|
||||
to_return: Path
|
||||
|
||||
# Try to get from the in-class cache
|
||||
if capture_uuid in self._captures_index:
|
||||
to_return = self._captures_index[capture_uuid].capture_dir
|
||||
if to_return.exists():
|
||||
return to_return
|
||||
self.redis.delete(str(to_return))
|
||||
self._captures_index.pop(capture_uuid)
|
||||
|
||||
# Try to get from the recent captures cache in redis
|
||||
capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
|
||||
if capture_dir:
|
||||
to_return = Path(capture_dir)
|
||||
if to_return.exists():
|
||||
return to_return
|
||||
# The capture was either removed or archived, cleaning up
|
||||
self.redis.hdel('lookup_dirs', capture_uuid)
|
||||
self.redis.delete(capture_dir)
|
||||
|
||||
# Try to get from the archived captures cache in redis
|
||||
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
|
||||
if capture_dir:
|
||||
to_return = Path(capture_dir)
|
||||
if to_return.exists():
|
||||
return to_return
|
||||
self.redis.hdel('lookup_dirs_archived', capture_uuid)
|
||||
# The capture was removed, remove the UUID
|
||||
self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}).')
|
||||
raise MissingCaptureDirectory(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}).')
|
||||
|
||||
raise MissingUUID(f'Unable to find UUID {capture_uuid}.')
|
||||
|
||||
def _cache_capture(self, capture_uuid: str, /) -> CrawledTree:
|
||||
'''Generate the pickle, set the cache, add capture in the indexes'''
|
||||
|
||||
capture_dir = self._get_capture_dir(capture_uuid)
|
||||
|
||||
har_files = sorted(capture_dir.glob('*.har'))
|
||||
lock_file = capture_dir / 'lock'
|
||||
pickle_file = capture_dir / 'tree.pickle'
|
||||
|
||||
if try_make_file(lock_file):
|
||||
# Lock created, we can process
|
||||
with lock_file.open('w') as f:
|
||||
f.write(datetime.now().isoformat())
|
||||
else:
|
||||
# The pickle is being created somewhere else, wait until it's done.
|
||||
while lock_file.exists():
|
||||
time.sleep(5)
|
||||
keep_going = 5
|
||||
while (ct := load_pickle_tree(capture_dir)) is None:
|
||||
keep_going -= 1
|
||||
if not keep_going:
|
||||
raise LookylooException(f'Unable to get tree for {capture_uuid}')
|
||||
time.sleep(5)
|
||||
return ct
|
||||
|
||||
# NOTE: We only index the public captures
|
||||
index = True
|
||||
try:
|
||||
ct = CrawledTree(har_files, capture_uuid)
|
||||
self._resolve_dns(ct)
|
||||
self.context.contextualize_tree(ct)
|
||||
cache = self.capture_cache(capture_uuid)
|
||||
if not cache:
|
||||
raise LookylooException(f'Broken cache for {capture_dir}')
|
||||
if self.is_public_instance:
|
||||
if cache.no_index:
|
||||
index = False
|
||||
if index:
|
||||
self.indexing.index_cookies_capture(ct)
|
||||
self.indexing.index_body_hashes_capture(ct)
|
||||
self.indexing.index_url_capture(ct)
|
||||
categories = list(self.categories_capture(capture_uuid).keys())
|
||||
self.indexing.index_categories_capture(capture_uuid, categories)
|
||||
except Har2TreeError as e:
|
||||
raise NoValidHarFile(e)
|
||||
except RecursionError as e:
|
||||
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
|
||||
else:
|
||||
with pickle_file.open('wb') as _p:
|
||||
# Some pickles require a pretty high recursion limit, this kindof fixes it.
|
||||
# If the capture is really broken (generally a refresh to self), the capture
|
||||
# is discarded in the RecursionError above.
|
||||
default_recursion_limit = sys.getrecursionlimit()
|
||||
sys.setrecursionlimit(int(default_recursion_limit * 1.1))
|
||||
try:
|
||||
pickle.dump(ct, _p)
|
||||
except RecursionError as e:
|
||||
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
|
||||
sys.setrecursionlimit(default_recursion_limit)
|
||||
finally:
|
||||
lock_file.unlink(missing_ok=True)
|
||||
return ct
|
||||
|
||||
def _set_capture_cache(self, capture_dir: Path):
|
||||
'''Populate the redis cache for a capture. Mostly used on the index page.
|
||||
NOTE: Doesn't require the pickle.'''
|
||||
with (capture_dir / 'uuid').open() as f:
|
||||
uuid = f.read().strip()
|
||||
|
||||
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': str(capture_dir)}
|
||||
if (capture_dir / 'error.txt').exists():
|
||||
# Something went wrong
|
||||
with (capture_dir / 'error.txt').open() as _error:
|
||||
content = _error.read()
|
||||
try:
|
||||
error_to_cache = json.loads(content)
|
||||
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
|
||||
error_to_cache = error_to_cache.get('details')
|
||||
except json.decoder.JSONDecodeError:
|
||||
# old format
|
||||
error_to_cache = content
|
||||
cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
|
||||
|
||||
if (har_files := sorted(capture_dir.glob('*.har'))):
|
||||
try:
|
||||
har = HarFile(har_files[0], uuid)
|
||||
cache['title'] = har.initial_title
|
||||
cache['timestamp'] = har.initial_start_time
|
||||
cache['url'] = har.root_url
|
||||
if har.initial_redirects and har.need_tree_redirects:
|
||||
# try to load tree from disk, get redirects
|
||||
if (ct := load_pickle_tree(capture_dir)):
|
||||
cache['redirects'] = json.dumps(ct.redirects)
|
||||
cache['incomplete_redirects'] = 0
|
||||
else:
|
||||
# Pickle not available
|
||||
cache['redirects'] = json.dumps(har.initial_redirects)
|
||||
cache['incomplete_redirects'] = 1
|
||||
else:
|
||||
cache['redirects'] = json.dumps(har.initial_redirects)
|
||||
cache['incomplete_redirects'] = 0
|
||||
|
||||
except Har2TreeError as e:
|
||||
cache['error'] = str(e)
|
||||
else:
|
||||
cache['error'] = f'No har files in {capture_dir.name}'
|
||||
|
||||
if (cache.get('error')
|
||||
and isinstance(cache['error'], str)
|
||||
and 'HTTP Error' not in cache['error']):
|
||||
self.logger.warning(cache['error'])
|
||||
|
||||
if (capture_dir / 'categories').exists():
|
||||
with (capture_dir / 'categories').open() as _categories:
|
||||
cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
|
||||
|
||||
if (capture_dir / 'no_index').exists():
|
||||
# If the folders claims anonymity
|
||||
cache['no_index'] = 1
|
||||
|
||||
if (capture_dir / 'parent').exists():
|
||||
# The capture was initiated from an other one
|
||||
with (capture_dir / 'parent').open() as f:
|
||||
cache['parent'] = f.read().strip()
|
||||
|
||||
p = self.redis.pipeline()
|
||||
p.hset('lookup_dirs', uuid, str(capture_dir))
|
||||
p.hmset(str(capture_dir), cache)
|
||||
p.execute()
|
||||
self._captures_index[uuid] = CaptureCache(cache)
|
||||
|
||||
def _resolve_dns(self, ct: CrawledTree):
|
||||
'''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
|
||||
and store them in ips.json and cnames.json, in the capture directory.
|
||||
Updates the nodes of the tree accordingly so the information is available.
|
||||
'''
|
||||
|
||||
def _build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
|
||||
'''Returns a list of CNAMEs starting from one hostname.
|
||||
The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
|
||||
and the CNAME entry can have an other CNAME entry, and so on multiple times.
|
||||
This method loops over the hostnames until there are no CNAMES.'''
|
||||
cnames: List[str] = []
|
||||
to_search = hostname
|
||||
while True:
|
||||
if known_cnames.get(to_search) is None:
|
||||
break
|
||||
# At this point, known_cnames[to_search] must exist and be a str
|
||||
cnames.append(known_cnames[to_search]) # type: ignore
|
||||
to_search = known_cnames[to_search]
|
||||
return cnames
|
||||
|
||||
cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
|
||||
ips_path = ct.root_hartree.har.path.parent / 'ips.json'
|
||||
host_cnames: Dict[str, Optional[str]] = {}
|
||||
if cnames_path.exists():
|
||||
with cnames_path.open() as f:
|
||||
host_cnames = json.load(f)
|
||||
|
||||
host_ips: Dict[str, List[str]] = {}
|
||||
if ips_path.exists():
|
||||
with ips_path.open() as f:
|
||||
host_ips = json.load(f)
|
||||
|
||||
for node in ct.root_hartree.hostname_tree.traverse():
|
||||
if node.name not in host_cnames or node.name not in host_ips:
|
||||
# Resolve and cache
|
||||
try:
|
||||
response = dns.resolver.resolve(node.name, search=True)
|
||||
for answer in response.response.answer:
|
||||
if answer.rdtype == dns.rdatatype.RdataType.CNAME:
|
||||
host_cnames[str(answer.name).rstrip('.')] = str(answer[0].target).rstrip('.')
|
||||
else:
|
||||
host_cnames[str(answer.name).rstrip('.')] = None
|
||||
|
||||
if answer.rdtype in [dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA]:
|
||||
host_ips[str(answer.name).rstrip('.')] = list(set(str(b) for b in answer))
|
||||
except Exception:
|
||||
host_cnames[node.name] = None
|
||||
host_ips[node.name] = []
|
||||
cnames = _build_cname_chain(host_cnames, node.name)
|
||||
if cnames:
|
||||
node.add_feature('cname', cnames)
|
||||
if cnames[-1] in host_ips:
|
||||
node.add_feature('resolved_ips', host_ips[cnames[-1]])
|
||||
elif node.name in host_ips:
|
||||
node.add_feature('resolved_ips', host_ips[node.name])
|
||||
|
||||
with cnames_path.open('w') as f:
|
||||
json.dump(host_cnames, f)
|
||||
with ips_path.open('w') as f:
|
||||
json.dump(host_ips, f)
|
||||
return ct
|
||||
return self._captures_index[capture_uuid].capture_dir
|
||||
|
||||
def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
|
||||
legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
|
||||
|
@ -338,8 +105,7 @@ class Lookyloo():
|
|||
|
||||
def remove_pickle(self, capture_uuid: str, /) -> None:
|
||||
'''Remove the pickle from a specific capture.'''
|
||||
capture_dir = self._get_capture_dir(capture_uuid)
|
||||
remove_pickle_tree(capture_dir)
|
||||
self._captures_index.remove_pickle(capture_uuid)
|
||||
|
||||
def rebuild_cache(self) -> None:
|
||||
'''Flush and rebuild the redis cache. Doesn't remove the pickles.
|
||||
|
@ -349,8 +115,7 @@ class Lookyloo():
|
|||
def rebuild_all(self) -> None:
|
||||
'''Flush and rebuild the redis cache, and delete all the pickles.
|
||||
The captures will be rebuilt by the background indexer'''
|
||||
[remove_pickle_tree(capture_dir) for capture_dir in self.capture_dir.iterdir() if capture_dir.is_dir()] # type: ignore
|
||||
self.rebuild_cache()
|
||||
self._captures_index.rebuild_all()
|
||||
|
||||
def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
|
||||
'''Get a URL node from a tree, by UUID'''
|
||||
|
@ -515,11 +280,9 @@ class Lookyloo():
|
|||
"""Add the capture in the hidden pool (not shown on the front page)
|
||||
NOTE: it won't remove the correlations until they are rebuilt.
|
||||
"""
|
||||
capture_dir = self._get_capture_dir(capture_uuid)
|
||||
self.redis.hset(str(capture_dir), 'no_index', 1)
|
||||
(capture_dir / 'no_index').touch()
|
||||
if capture_uuid in self._captures_index:
|
||||
self._captures_index[capture_uuid].no_index = True
|
||||
self.redis.hset(str(self._get_capture_dir(capture_uuid)), 'no_index', 1)
|
||||
(self._get_capture_dir(capture_uuid) / 'no_index').touch()
|
||||
self._captures_index.reload_cache(capture_uuid)
|
||||
|
||||
@property
|
||||
def capture_uuids(self) -> List[str]:
|
||||
|
@ -535,31 +298,7 @@ class Lookyloo():
|
|||
# No captures at all on the instance
|
||||
return []
|
||||
|
||||
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids
|
||||
if (uuid in self._captures_index
|
||||
and not self._captures_index[uuid].incomplete_redirects)]
|
||||
|
||||
captures_to_get = set(capture_uuids) - set(self._captures_index.keys())
|
||||
if captures_to_get:
|
||||
p = self.redis.pipeline()
|
||||
for directory in self.redis.hmget('lookup_dirs', *captures_to_get):
|
||||
if not directory:
|
||||
continue
|
||||
p.hgetall(directory)
|
||||
for uuid, c in zip(captures_to_get, p.execute()):
|
||||
try:
|
||||
if not c:
|
||||
c = self.capture_cache(uuid)
|
||||
if not c:
|
||||
continue
|
||||
else:
|
||||
c = CaptureCache(c)
|
||||
except LookylooException as e:
|
||||
self.logger.warning(e)
|
||||
continue
|
||||
if hasattr(c, 'timestamp'):
|
||||
all_cache.append(c)
|
||||
self._captures_index[c.uuid] = c
|
||||
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid)]
|
||||
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
|
||||
return all_cache
|
||||
|
||||
|
@ -577,15 +316,8 @@ class Lookyloo():
|
|||
|
||||
def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
|
||||
"""Get the cache from redis."""
|
||||
if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
|
||||
return self._captures_index[capture_uuid]
|
||||
try:
|
||||
capture_dir = self._get_capture_dir(capture_uuid)
|
||||
cached = self.redis.hgetall(str(capture_dir))
|
||||
if not cached or cached.get('incomplete_redirects') == '1':
|
||||
self._set_capture_cache(capture_dir)
|
||||
else:
|
||||
self._captures_index[capture_uuid] = CaptureCache(cached)
|
||||
return self._captures_index[capture_uuid]
|
||||
except MissingCaptureDirectory as e:
|
||||
# The UUID is in the captures but the directory is not on the disk.
|
||||
self.logger.warning(e)
|
||||
|
@ -600,17 +332,11 @@ class Lookyloo():
|
|||
except Exception as e:
|
||||
self.logger.critical(e)
|
||||
return None
|
||||
else:
|
||||
return self._captures_index[capture_uuid]
|
||||
|
||||
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
|
||||
'''Get the generated tree in ETE Toolkit format.
|
||||
Loads the pickle if it exists, creates it otherwise.'''
|
||||
capture_dir = self._get_capture_dir(capture_uuid)
|
||||
ct = load_pickle_tree(capture_dir)
|
||||
if not ct:
|
||||
ct = self._cache_capture(capture_uuid)
|
||||
return ct
|
||||
return self._captures_index[capture_uuid].tree
|
||||
|
||||
def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
|
||||
'''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
|
||||
|
@ -923,14 +649,6 @@ class Lookyloo():
|
|||
if not cache:
|
||||
return {'error': 'UUID missing in cache, try again later.'}
|
||||
|
||||
if cache.incomplete_redirects:
|
||||
ct = self._cache_capture(capture_uuid)
|
||||
cache = self.capture_cache(capture_uuid)
|
||||
if not cache:
|
||||
return {'error': 'UUID missing in cache, try again later.'}
|
||||
else:
|
||||
ct = self.get_crawled_tree(capture_uuid)
|
||||
|
||||
event = MISPEvent()
|
||||
event.info = f'Lookyloo Capture ({cache.url})'
|
||||
lookyloo_link: MISPAttribute = event.add_attribute('link', f'https://{self.public_domain}/tree/{capture_uuid}') # type: ignore
|
||||
|
@ -939,7 +657,7 @@ class Lookyloo():
|
|||
|
||||
initial_url = URLObject(cache.url)
|
||||
initial_url.comment = 'Submitted URL'
|
||||
self.__misp_add_ips_to_URLObject(initial_url, ct.root_hartree.hostname_tree)
|
||||
self.__misp_add_ips_to_URLObject(initial_url, cache.tree.root_hartree.hostname_tree)
|
||||
|
||||
redirects: List[URLObject] = []
|
||||
for nb, url in enumerate(cache.redirects):
|
||||
|
@ -947,7 +665,7 @@ class Lookyloo():
|
|||
continue
|
||||
obj = URLObject(url)
|
||||
obj.comment = f'Redirect {nb}'
|
||||
self.__misp_add_ips_to_URLObject(obj, ct.root_hartree.hostname_tree)
|
||||
self.__misp_add_ips_to_URLObject(obj, cache.tree.root_hartree.hostname_tree)
|
||||
redirects.append(obj)
|
||||
if redirects:
|
||||
redirects[-1].comment = f'Last redirect ({nb})'
|
||||
|
@ -967,7 +685,7 @@ class Lookyloo():
|
|||
|
||||
screenshot: MISPAttribute = event.add_attribute('attachment', 'screenshot_landing_page.png', data=self.get_screenshot(capture_uuid), disable_correlation=True) # type: ignore
|
||||
try:
|
||||
fo = FileObject(pseudofile=ct.root_hartree.rendered_node.body, filename=ct.root_hartree.rendered_node.filename)
|
||||
fo = FileObject(pseudofile=cache.tree.root_hartree.rendered_node.body, filename=cache.tree.root_hartree.rendered_node.filename)
|
||||
fo.comment = 'Content received for the final redirect (before rendering)'
|
||||
fo.add_reference(final_redirect, 'loaded-by', 'URL loading that content')
|
||||
fo.add_reference(screenshot, 'rendered-as', 'Screenshot of the page')
|
||||
|
|
Loading…
Reference in New Issue