mirror of https://github.com/CIRCL/lookyloo
chg: Major refactoring, move capture code to external script.
parent d8416f0f47
commit bf700e7a7b
@@ -1,10 +1,27 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import base64
import ipaddress
import json
import logging
import socket

from io import BufferedIOBase
from datetime import datetime
from pathlib import Path
from typing import Union, Dict, Optional, Tuple, List
from urllib.parse import urlsplit
from uuid import uuid4

from defang import refang  # type: ignore
from redis import Redis
from scrapysplashwrapper import crawl

from lookyloo.abstractmanager import AbstractManager
from lookyloo.helpers import shutdown_requested
from lookyloo.helpers import (shutdown_requested, splash_status, get_socket_path,
                              load_cookies, safe_create_dir, get_config, get_splash_url,
                              get_captures_dir)
from lookyloo.lookyloo import Lookyloo

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
@@ -17,10 +34,159 @@ class AsyncCapture(AbstractManager):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo()
        self.script_name = 'async_capture'
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.splash_url: str = get_splash_url()
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    def process_capture_queue(self) -> Union[bool, None]:
        '''Process a query from the capture queue'''
        if not self.redis.exists('to_capture'):
            return None

        status, message = splash_status()
        if not status:
            self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
            return None

        value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture')  # type: ignore
        if not value or not value[0]:
            return None
        uuid, score = value[0]
        queue: Optional[str] = self.redis.get(f'{uuid}_mgmt')
        self.redis.sadd('ongoing', uuid)

        lazy_cleanup = self.redis.pipeline()
        lazy_cleanup.delete(f'{uuid}_mgmt')
        if queue:
            # queue shouldn't be none, but if it is, just ignore.
            lazy_cleanup.zincrby('queues', -1, queue)

        to_capture: Dict[str, str] = self.redis.hgetall(uuid)
        to_capture['perma_uuid'] = uuid
        if 'cookies' in to_capture:
            to_capture['cookies_pseudofile'] = to_capture.pop('cookies')

        status = self._capture(**to_capture)  # type: ignore
        lazy_cleanup.srem('ongoing', uuid)
        lazy_cleanup.delete(uuid)
        # make sure to expire the key if nothing was process for a while (= queues empty)
        lazy_cleanup.expire('queues', 600)
        lazy_cleanup.execute()
        if status:
            self.logger.info(f'Processed {to_capture["url"]}')
            return True
        self.logger.warning(f'Unable to capture {to_capture["url"]}')
        return False

    def _capture(self, url: str, *, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
                 referer: str='', proxy: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
                 browser: Optional[str]=None, parent: Optional[str]=None) -> Union[bool, str]:
        '''Launch a capture'''
        url = url.strip()
        url = refang(url)
        if not url.startswith('http'):
            url = f'http://{url}'
        if self.only_global_lookups:
            splitted_url = urlsplit(url)
            if splitted_url.netloc:
                if splitted_url.hostname:
                    if splitted_url.hostname.split('.')[-1] != 'onion':
                        try:
                            ip = socket.gethostbyname(splitted_url.hostname)
                        except socket.gaierror:
                            self.logger.info('Name or service not known')
                            return False
                        if not ipaddress.ip_address(ip).is_global:
                            return False
            else:
                return False

        cookies = load_cookies(cookies_pseudofile)
        if not user_agent:
            # Catch case where the UA is broken on the UI, and the async submission.
            ua: str = get_config('generic', 'default_user_agent')
        else:
            ua = user_agent

        if int(depth) > int(get_config('generic', 'max_depth')):
            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
            depth = int(get_config('generic', 'max_depth'))
        if not perma_uuid:
            perma_uuid = str(uuid4())
        self.logger.info(f'Capturing {url}')
        try:
            items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                          referer=referer, proxy=proxy, log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
        except Exception as e:
            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
            raise e
        if not items:
            # broken
            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
            return False
        width = len(str(len(items)))
        dirpath = self.capture_dir / datetime.now().isoformat()
        safe_create_dir(dirpath)

        if os or browser:
            meta = {}
            if os:
                meta['os'] = os
            if browser:
                meta['browser'] = browser
            with (dirpath / 'meta').open('w') as _meta:
                json.dump(meta, _meta)

        # Write UUID
        with (dirpath / 'uuid').open('w') as _uuid:
            _uuid.write(perma_uuid)

        # Write no_index marker (optional)
        if not listing:
            (dirpath / 'no_index').touch()

        # Write parent UUID (optional)
        if parent:
            with (dirpath / 'parent').open('w') as _parent:
                _parent.write(parent)

        for i, item in enumerate(items):
            if 'error' in item:
                with (dirpath / 'error.txt').open('w') as _error:
                    json.dump(item['error'], _error)

            # The capture went fine
            harfile = item['har']
            png = base64.b64decode(item['png'])
            html = item['html']
            last_redirect = item['last_redirected_url']

            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
                json.dump(harfile, _har)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
                _img.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
                _html.write(html)
            with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
                _redir.write(last_redirect)

            if 'childFrames' in item:
                child_frames = item['childFrames']
                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                    json.dump(child_frames, _iframes)

            if 'cookies' in item:
                cookies = item['cookies']
                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                    json.dump(cookies, _cookies)
        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
        return perma_uuid

    def _to_run_forever(self):
        while True:
            url = self.lookyloo.process_capture_queue()
            url = self.process_capture_queue()
            if url is None or shutdown_requested():
                break
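For reference, the Redis contract between the enqueueing side and this worker can be exercised by hand. The sketch below is not part of the commit: the key names ('to_capture', '{uuid}_mgmt', 'queues', 'ongoing', 'lookup_dirs') come from the code above, while the socket path and the field values are placeholders.

from uuid import uuid4
from redis import Redis

# Placeholder socket path; the worker uses get_socket_path('cache').
r = Redis(unix_socket_path='cache.sock', decode_responses=True)

uuid = str(uuid4())
# Capture settings live in a hash keyed by the capture UUID.
r.hset(uuid, mapping={'url': 'http://example.com', 'listing': 1, 'depth': 1})
# The queue itself is a sorted set popped with zpopmax, so the score acts as the priority.
r.zadd('to_capture', {uuid: 0})
# Bookkeeping used by process_capture_queue() to decrement the per-source counters.
r.set(f'{uuid}_mgmt', 'web|False|anonymous')
r.zincrby('queues', 1, 'web|False|anonymous')

# AsyncCapture.process_capture_queue() pops the UUID, calls _capture(**fields),
# and registers the result directory in the 'lookup_dirs' hash under the same UUID.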
@@ -22,6 +22,9 @@ def main():
    print('Start background indexer...')
    Popen(['background_indexer'])
    print('done.')
    print('Start background processing...')
    Popen(['processing'])
    print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')
@@ -7,7 +7,8 @@ import json
import traceback
import pickle
import pkg_resources
from typing import List, Optional, Dict, Union, Any, Set
from typing import List, Optional, Dict, Union, Any, Set, Tuple
from urllib.parse import urljoin
from io import BufferedIOBase
from pathlib import Path
from datetime import datetime, timedelta
@@ -19,6 +20,8 @@ from enum import IntEnum, unique
from har2tree import CrawledTree, HostNode, URLNode
from redis import Redis
from redis.exceptions import ConnectionError
import requests
from requests.exceptions import HTTPError
from publicsuffix2 import PublicSuffixList, fetch  # type: ignore
from bs4 import BeautifulSoup  # type: ignore
from pytaxonomies import Taxonomies
@@ -377,3 +380,38 @@ def reload_uuids_index() -> None:
    p.delete('lookup_dirs')
    p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
    p.execute()


def get_capture_status(capture_uuid: str, /) -> CaptureStatus:
    r = Redis(unix_socket_path=get_socket_path('cache'))
    if r.zrank('to_capture', capture_uuid) is not None:
        return CaptureStatus.QUEUED
    elif r.hexists('lookup_dirs', capture_uuid):
        return CaptureStatus.DONE
    elif r.sismember('ongoing', capture_uuid):
        return CaptureStatus.ONGOING
    return CaptureStatus.UNKNOWN


@lru_cache(64)
def get_splash_url() -> str:
    if os.environ.get('SPLASH_URL_DOCKER'):
        # In order to have a working default for the docker image, it is easier to use an environment variable
        return os.environ['SPLASH_URL_DOCKER']
    else:
        return get_config('generic', 'splash_url')


def splash_status() -> Tuple[bool, str]:
    try:
        splash_status = requests.get(urljoin(get_splash_url(), '_ping'))
        splash_status.raise_for_status()
        json_status = splash_status.json()
        if json_status['status'] == 'ok':
            return True, 'Splash is up'
        else:
            return False, str(json_status)
    except HTTPError as http_err:
        return False, f'HTTP error occurred: {http_err}'
    except Exception as err:
        return False, f'Other error occurred: {err}'
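These helpers replace methods that previously lived on the Lookyloo class, so a status check no longer needs a Lookyloo instance. A minimal usage sketch, not part of the commit; the UUID is a placeholder and CaptureStatus is the enum already defined in this module.

from lookyloo.helpers import splash_status, get_splash_url, get_capture_status, CaptureStatus

up, message = splash_status()
if not up:
    print(f'Splash at {get_splash_url()} is unreachable: {message}')

status = get_capture_status('00000000-0000-0000-0000-000000000000')  # placeholder UUID
if status == CaptureStatus.DONE:
    print('The capture is ready.')
elif status in (CaptureStatus.QUEUED, CaptureStatus.ONGOING):
    print('The capture is still being processed.')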
@@ -1,46 +1,37 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import base64
from collections import defaultdict
from datetime import datetime, date
from email.message import EmailMessage
from io import BufferedIOBase, BytesIO
import ipaddress
from io import BytesIO
import json
import logging
from pathlib import Path
import pickle
import smtplib
import socket
import sys
from typing import Union, Dict, List, Tuple, Optional, Any, MutableMapping, Set, Iterable
from urllib.parse import urlsplit, urljoin
from uuid import uuid4
from zipfile import ZipFile
import operator
import time

from defang import refang  # type: ignore
import dns.resolver
import dns.rdatatype
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
from PIL import Image  # type: ignore
from pymisp import MISPEvent, MISPAttribute, MISPObject
from pymisp.tools import URLObject, FileObject
import requests
from requests.exceptions import HTTPError
from redis import Redis, ConnectionPool
from redis.connection import UnixDomainSocketConnection
from scrapysplashwrapper import crawl
from werkzeug.useragents import UserAgent

from .exceptions import NoValidHarFile, MissingUUID, LookylooException, MissingCaptureDirectory
from .helpers import (get_homedir, get_socket_path, load_cookies, get_config,
                      safe_create_dir, get_email_template, load_pickle_tree,
from .helpers import (get_homedir, get_socket_path, get_config, get_email_template, load_pickle_tree,
                      remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains,
                      CaptureStatus, try_make_file, get_captures_dir)
                      try_make_file, get_captures_dir, get_splash_url)
from .modules import VirusTotal, SaneJavaScript, PhishingInitiative, MISP, UniversalWhois, UrlScan
from .capturecache import CaptureCache
from .context import Context
@@ -60,12 +51,7 @@ class Lookyloo():
        self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                         path=get_socket_path('cache'), decode_responses=True)
        self.capture_dir: Path = get_captures_dir()
        if os.environ.get('SPLASH_URL_DOCKER'):
            # In order to have a working default for the docker image, it is easier to use an environment variable
            self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
        else:
            self.splash_url = get_config('generic', 'splash_url')
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.splash_url: str = get_splash_url()

        self._priority = get_config('generic', 'priority')
@@ -101,11 +87,6 @@ class Lookyloo():
    def redis(self):
        return Redis(connection_pool=self.redis_pool)

    def cache_user_agents(self, user_agent: str, remote_ip: str) -> None:
        '''Cache the useragents of the visitors'''
        today = date.today().isoformat()
        self.redis.zincrby(f'user_agents|{today}', 1, f'{remote_ip}|{user_agent}')

    def _get_capture_dir(self, capture_uuid: str, /) -> Path:
        '''Use the cache to get a capture directory from a capture UUID'''
        capture_dir: Optional[Union[str, Path]]
@@ -135,6 +116,31 @@ class Lookyloo():

    def _cache_capture(self, capture_uuid: str, /) -> CrawledTree:
        '''Generate the pickle, set the cache, add capture in the indexes'''

        def _ensure_meta(capture_dir: Path, tree: CrawledTree) -> None:
            '''Make sure the meta file is present, it contains information about the User Agent used for the capture.'''
            metafile = capture_dir / 'meta'
            if metafile.exists():
                return
            ua = UserAgent(tree.root_hartree.user_agent)
            to_dump = {}
            if ua.platform:
                to_dump['os'] = ua.platform
            if ua.browser:
                if ua.version:
                    to_dump['browser'] = f'{ua.browser} {ua.version}'
                else:
                    to_dump['browser'] = ua.browser
            if ua.language:
                to_dump['language'] = ua.language

            if not to_dump:
                # UA not recognized
                self.logger.info(f'Unable to recognize the User agent: {ua}')
            to_dump['user_agent'] = ua.string
            with metafile.open('w') as f:
                json.dump(to_dump, f)

        capture_dir = self._get_capture_dir(capture_uuid)
        har_files = sorted(capture_dir.glob('*.har'))
        lock_file = capture_dir / 'lock'
@@ -160,7 +166,7 @@ class Lookyloo():
        index = True
        try:
            ct = CrawledTree(har_files, capture_uuid)
            self._ensure_meta(capture_dir, ct)
            _ensure_meta(capture_dir, ct)
            self._resolve_dns(ct)
            self.context.contextualize_tree(ct)
            cache = self.capture_cache(capture_uuid)
@@ -271,7 +277,13 @@ class Lookyloo():
            self._captures_index.pop(uuid, None)
        return cache

    def _build_cname_chain(self, known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
    def _resolve_dns(self, ct: CrawledTree):
        '''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
        and store them in ips.json and cnames.json, in the capture directory.
        Updates the nodes of the tree accordingly so the information is available.
        '''

        def _build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
            '''Returns a list of CNAMEs starting from one hostname.
            The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
            and the CNAME entry can have an other CNAME entry, and so on multiple times.
@@ -286,11 +298,6 @@ class Lookyloo():
                to_search = known_cnames[to_search]
            return cnames

    def _resolve_dns(self, ct: CrawledTree):
        '''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
        and store them in ips.json and cnames.json, in the capture directory.
        Updates the nodes of the tree accordingly so the information is available.
        '''
        cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
        ips_path = ct.root_hartree.har.path.parent / 'ips.json'
        host_cnames: Dict[str, Optional[str]] = {}
@@ -319,7 +326,7 @@ class Lookyloo():
                except Exception:
                    host_cnames[node.name] = None
                    host_ips[node.name] = []
            cnames = self._build_cname_chain(host_cnames, node.name)
            cnames = _build_cname_chain(host_cnames, node.name)
            if cnames:
                node.add_feature('cname', cnames)
                if cnames[-1] in host_ips:
@@ -494,24 +501,6 @@ class Lookyloo():
            to_return['urlscan']['result'] = result
        return to_return

    def get_misp_occurrences(self, capture_uuid: str, /) -> Optional[Dict[str, Set[str]]]:
        if not self.misp.available:
            return None
        try:
            ct = self.get_crawled_tree(capture_uuid)
        except LookylooException:
            self.logger.warning(f'Unable to get the modules responses unless the tree ({capture_uuid}) is cached.')
            return None
        nodes_to_lookup = ct.root_hartree.rendered_node.get_ancestors() + [ct.root_hartree.rendered_node]
        to_return: Dict[str, Set[str]] = defaultdict(set)
        for node in nodes_to_lookup:
            hits = self.misp.lookup(node, ct.root_hartree.get_host_node_by_uuid(node.hostnode_uuid))
            for event_id, values in hits.items():
                if not isinstance(values, set):
                    continue
                to_return[event_id].update(values)
        return to_return

    def hide_capture(self, capture_uuid: str, /) -> None:
        """Add the capture in the hidden pool (not shown on the front page)
        NOTE: it won't remove the correlations until they are rebuilt.
@@ -594,17 +583,10 @@ class Lookyloo():
            raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
        return ct

    def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus:
        redis = self.redis  # use a single connection
        if redis.zrank('to_capture', capture_uuid) is not None:
            return CaptureStatus.QUEUED
        elif redis.hexists('lookup_dirs', capture_uuid):
            return CaptureStatus.DONE
        elif redis.sismember('ongoing', capture_uuid):
            return CaptureStatus.ONGOING
        return CaptureStatus.UNKNOWN
    def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
        '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''

        def _get_priority(self, source: str, user: str, authenticated: bool) -> int:
        def _get_priority(source: str, user: str, authenticated: bool) -> int:
            src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1
            if not authenticated:
                usr_prio = self._priority['users']['_default_anon']
@@ -617,8 +599,7 @@ class Lookyloo():
            usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth']
            return src_prio + usr_prio

    def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
        '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
        priority = _get_priority(source, user, authenticated)
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        for key, value in query.items():
@@ -627,53 +608,16 @@ class Lookyloo():
                query[key] = 1 if value else ''
            if isinstance(value, list):
                query[key] = json.dumps(value)
        if priority < -10:
            # Someone is probably abusing the system with useless URLs, remove them from the index
            query['listing'] = 0
        p.hmset(perma_uuid, query)
        priority = self._get_priority(source, user, authenticated)
        p.zadd('to_capture', {perma_uuid: priority})
        p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
        p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
        p.execute()
        return perma_uuid

    def process_capture_queue(self) -> Union[bool, None]:
        '''Process a query from the capture queue'''
        redis = self.redis  # use a single connection
        if not redis.exists('to_capture'):
            return None

        status, message = self.splash_status()
        if not status:
            self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
            return None

        value = redis.zpopmax('to_capture')
        if not value or not value[0]:
            return None
        uuid, score = value[0]
        queue: str = redis.get(f'{uuid}_mgmt')
        redis.sadd('ongoing', uuid)

        lazy_cleanup = redis.pipeline()
        lazy_cleanup.delete(f'{uuid}_mgmt')
        lazy_cleanup.zincrby('queues', -1, queue)

        to_capture: Dict[str, Union[str, int, float]] = redis.hgetall(uuid)
        to_capture['perma_uuid'] = uuid
        if 'cookies' in to_capture:
            to_capture['cookies_pseudofile'] = to_capture.pop('cookies')

        status = self._capture(**to_capture)  # type: ignore
        lazy_cleanup.srem('ongoing', uuid)
        lazy_cleanup.delete(uuid)
        # make sure to expire the key if nothing was process for a while (= queues empty)
        lazy_cleanup.expire('queues', 600)
        lazy_cleanup.execute()
        if status:
            self.logger.info(f'Processed {to_capture["url"]}')
            return True
        self.logger.warning(f'Unable to capture {to_capture["url"]}')
        return False

    def send_mail(self, capture_uuid: str, /, email: str='', comment: str='') -> None:
        '''Send an email notification regarding a specific capture'''
        if not get_config('generic', 'enable_mail_notification'):
@@ -716,30 +660,6 @@ class Lookyloo():
            self.logger.exception(e)
            self.logger.warning(msg.as_string())

    def _ensure_meta(self, capture_dir: Path, tree: CrawledTree) -> None:
        '''Make sure the meta file is present, it contains information about the User Agent used for the capture.'''
        metafile = capture_dir / 'meta'
        if metafile.exists():
            return
        ua = UserAgent(tree.root_hartree.user_agent)
        to_dump = {}
        if ua.platform:
            to_dump['os'] = ua.platform
        if ua.browser:
            if ua.version:
                to_dump['browser'] = f'{ua.browser} {ua.version}'
            else:
                to_dump['browser'] = ua.browser
        if ua.language:
            to_dump['language'] = ua.language

        if not to_dump:
            # UA not recognized
            self.logger.info(f'Unable to recognize the User agent: {ua}')
        to_dump['user_agent'] = ua.string
        with metafile.open('w') as f:
            json.dump(to_dump, f)

    def _get_raw(self, capture_uuid: str, /, extension: str='*', all_files: bool=True) -> BytesIO:
        '''Get file(s) from the capture directory'''
        try:
@@ -806,125 +726,6 @@ class Lookyloo():
        return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
                      - set(ct.root_hartree.all_url_requests.keys()))

    def splash_status(self) -> Tuple[bool, str]:
        try:
            splash_status = requests.get(urljoin(self.splash_url, '_ping'))
            splash_status.raise_for_status()
            json_status = splash_status.json()
            if json_status['status'] == 'ok':
                return True, 'Splash is up'
            else:
                return False, str(json_status)
        except HTTPError as http_err:
            return False, f'HTTP error occurred: {http_err}'
        except Exception as err:
            return False, f'Other error occurred: {err}'

    def _capture(self, url: str, *, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
                 referer: str='', proxy: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
                 browser: Optional[str]=None, parent: Optional[str]=None) -> Union[bool, str]:
        '''Launch a capture'''
        url = url.strip()
        url = refang(url)
        if not url.startswith('http'):
            url = f'http://{url}'
        if self.only_global_lookups:
            splitted_url = urlsplit(url)
            if splitted_url.netloc:
                if splitted_url.hostname:
                    if splitted_url.hostname.split('.')[-1] != 'onion':
                        try:
                            ip = socket.gethostbyname(splitted_url.hostname)
                        except socket.gaierror:
                            self.logger.info('Name or service not known')
                            return False
                        if not ipaddress.ip_address(ip).is_global:
                            return False
            else:
                return False

        cookies = load_cookies(cookies_pseudofile)
        if not user_agent:
            # Catch case where the UA is broken on the UI, and the async submission.
            ua: str = get_config('generic', 'default_user_agent')
        else:
            ua = user_agent

        if int(depth) > int(get_config('generic', 'max_depth')):
            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
            depth = int(get_config('generic', 'max_depth'))
        if not perma_uuid:
            perma_uuid = str(uuid4())
        self.logger.info(f'Capturing {url}')
        try:
            items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                          referer=referer, proxy=proxy, log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
        except Exception as e:
            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
            raise e
        if not items:
            # broken
            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
            return False
        width = len(str(len(items)))
        dirpath = self.capture_dir / datetime.now().isoformat()
        safe_create_dir(dirpath)

        if os or browser:
            meta = {}
            if os:
                meta['os'] = os
            if browser:
                meta['browser'] = browser
            with (dirpath / 'meta').open('w') as _meta:
                json.dump(meta, _meta)

        # Write UUID
        with (dirpath / 'uuid').open('w') as _uuid:
            _uuid.write(perma_uuid)

        # Write no_index marker (optional)
        if not listing:
            (dirpath / 'no_index').touch()

        # Write parent UUID (optional)
        if parent:
            with (dirpath / 'parent').open('w') as _parent:
                _parent.write(parent)

        for i, item in enumerate(items):
            if 'error' in item:
                with (dirpath / 'error.txt').open('w') as _error:
                    json.dump(item['error'], _error)

            # The capture went fine
            harfile = item['har']
            png = base64.b64decode(item['png'])
            html = item['html']
            last_redirect = item['last_redirected_url']

            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
                json.dump(harfile, _har)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
                _img.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
                _html.write(html)
            with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
                _redir.write(last_redirect)

            if 'childFrames' in item:
                child_frames = item['childFrames']
                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                    json.dump(child_frames, _iframes)

            if 'cookies' in item:
                cookies = item['cookies']
                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                    json.dump(cookies, _cookies)
        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
        return perma_uuid

    def get_body_hash_investigator(self, body_hash: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:
        '''Returns all the captures related to a hash (sha512), used in the web interface.'''
        total_captures, details = self.indexing.get_body_hash_captures(body_hash, limit=-1)
@@ -1033,26 +834,6 @@ class Lookyloo():
            captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
        return total_captures, captures_list

    def _normalize_known_content(self, h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
        ''' There are a few different sources to figure out known vs. legitimate content,
        this method normalize it for the web interface.'''
        known: Optional[Union[str, List[Any]]] = None
        legitimate: Optional[Tuple[bool, Any]] = None
        if h not in known_content:
            return known, legitimate

        if known_content[h]['type'] in ['generic', 'sanejs']:
            known = known_content[h]['details']
        elif known_content[h]['type'] == 'legitimate_on_domain':
            legit = False
            if url.hostname in known_content[h]['details']:
                legit = True
            legitimate = (legit, known_content[h]['details'])
        elif known_content[h]['type'] == 'malicious':
            legitimate = (False, known_content[h]['details'])

        return known, legitimate

    def get_ressource(self, tree_uuid: str, /, urlnode_uuid: str, h: Optional[str]) -> Optional[Tuple[str, BytesIO, str]]:
        '''Get a specific resource from a URL node. If a hash s also given, we want an embeded resource'''
        try:
@@ -1178,6 +959,24 @@ class Lookyloo():

        return [event]

    def get_misp_occurrences(self, capture_uuid: str, /) -> Optional[Dict[str, Set[str]]]:
        if not self.misp.available:
            return None
        try:
            ct = self.get_crawled_tree(capture_uuid)
        except LookylooException:
            self.logger.warning(f'Unable to get the modules responses unless the tree ({capture_uuid}) is cached.')
            return None
        nodes_to_lookup = ct.root_hartree.rendered_node.get_ancestors() + [ct.root_hartree.rendered_node]
        to_return: Dict[str, Set[str]] = defaultdict(set)
        for node in nodes_to_lookup:
            hits = self.misp.lookup(node, ct.root_hartree.get_host_node_by_uuid(node.hostnode_uuid))
            for event_id, values in hits.items():
                if not isinstance(values, set):
                    continue
                to_return[event_id].update(values)
        return to_return

    def get_hashes(self, tree_uuid: str, /, hostnode_uuid: Optional[str]=None, urlnode_uuid: Optional[str]=None) -> Set[str]:
        """Return hashes of resources.
        Only tree_uuid: All the hashes
@@ -1227,6 +1026,27 @@ class Lookyloo():

    def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
        '''Gather all the informations needed to display the Hostnode investigator popup.'''

        def _normalize_known_content(h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
            ''' There are a few different sources to figure out known vs. legitimate content,
            this method normalize it for the web interface.'''
            known: Optional[Union[str, List[Any]]] = None
            legitimate: Optional[Tuple[bool, Any]] = None
            if h not in known_content:
                return known, legitimate

            if known_content[h]['type'] in ['generic', 'sanejs']:
                known = known_content[h]['details']
            elif known_content[h]['type'] == 'legitimate_on_domain':
                legit = False
                if url.hostname in known_content[h]['details']:
                    legit = True
                legitimate = (legit, known_content[h]['details'])
            elif known_content[h]['type'] == 'malicious':
                legitimate = (False, known_content[h]['details'])

            return known, legitimate

        ct = self.get_crawled_tree(capture_uuid)
        hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
        if not hostnode:
@@ -1270,13 +1090,13 @@ class Lookyloo():
                if freq_embedded['hash_freq'] > 1:
                    to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid)
            for h in to_append['embedded_ressources'].keys():
                known, legitimate = self._normalize_known_content(h, known_content, url)
                known, legitimate = _normalize_known_content(h, known_content, url)
                if known:
                    to_append['embedded_ressources'][h]['known_content'] = known
                elif legitimate:
                    to_append['embedded_ressources'][h]['legitimacy'] = legitimate

            known, legitimate = self._normalize_known_content(url.body_hash, known_content, url)
            known, legitimate = _normalize_known_content(url.body_hash, known_content, url)
            if known:
                to_append['known_content'] = known
            elif legitimate:
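After this change the Lookyloo class only enqueues captures; the crawl itself happens in the AsyncCapture worker. A sketch of the resulting flow from a caller's point of view, not part of the commit: the enqueue_capture() signature and the query fields are taken from the diff, while the polling loop and a fully configured Lookyloo install are assumptions.

import time

from lookyloo.lookyloo import Lookyloo
from lookyloo.helpers import get_capture_status, CaptureStatus

lookyloo = Lookyloo()  # assumes a configured instance (config files, Redis sockets)
uuid = lookyloo.enqueue_capture({'url': 'http://example.com', 'listing': True, 'depth': 1},
                                source='web', user='anonymous', authenticated=False)

# The AsyncCapture worker pops the entry from 'to_capture' and runs the crawl;
# poll the status helper until the capture is done.
while get_capture_status(uuid) not in (CaptureStatus.DONE, CaptureStatus.UNKNOWN):
    time.sleep(5)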
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

from io import BytesIO, StringIO
from datetime import datetime, timedelta, timezone
from datetime import datetime, timedelta, timezone, date
import json
import http
import calendar
@@ -21,13 +21,14 @@ from werkzeug.security import check_password_hash

from pymisp import MISPEvent, MISPServerError

from lookyloo.helpers import (get_user_agents, get_config,
                              get_taxonomies, load_cookies, CaptureStatus)
from lookyloo.helpers import (get_user_agents, get_config, get_taxonomies, load_cookies,
                              CaptureStatus, splash_status, get_capture_status)
from lookyloo.lookyloo import Lookyloo, Indexing
from lookyloo.exceptions import NoValidHarFile, MissingUUID

from .proxied import ReverseProxied
from .helpers import src_request_ip, User, load_user_from_request, build_users_table, get_secret_key, sri_load
from .helpers import (src_request_ip, User, load_user_from_request, build_users_table,
                      get_secret_key, sri_load)

app: Flask = Flask(__name__)
app.wsgi_app = ReverseProxied(app.wsgi_app)  # type: ignore
@@ -157,18 +158,20 @@ app.jinja_env.globals.update(get_sri=get_sri)

@app.after_request
def after_request(response):
    if use_own_ua:
        # We keep a list user agents in order to build a list to use in the capture
        # interface: this is the easiest way to have something up to date.
        # The reason we also get the IP address of the client is because we
        # count the frequency of each user agents and use it to sort them on the
        # capture page, and we want to avoid counting the same user (same IP)
        # multiple times in a day.
        # The cache of IPs is deleted after the UA file is generated (see lookyloo.build_ua_file),
        # once a day.
        # The cache of IPs is deleted after the UA file is generated once a day.
        # See bin/background_processing.py
        ua = request.headers.get('User-Agent')
        real_ip = src_request_ip(request)
        if ua:
            lookyloo.cache_user_agents(ua, real_ip)
            today = date.today().isoformat()
            lookyloo.redis.zincrby(f'user_agents|{today}', 1, f'{real_ip}|{ua}')
    # Opt out of FLoC
    response.headers.set('Permissions-Policy', 'interest-cohort=()')
    return response
@@ -554,8 +557,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
    try:
        cache = lookyloo.capture_cache(tree_uuid)
    except MissingUUID:
        status = lookyloo.get_capture_status(tree_uuid)
        splash_up, splash_message = lookyloo.splash_status()
        status = get_capture_status(tree_uuid)
        splash_up, splash_message = splash_status()
        if not splash_up:
            flash(f'The capture module is not reachable ({splash_message}).', 'error')
            flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
@@ -809,7 +812,7 @@ def capture_web():
        if 'bot' not in ua['useragent'].lower():
            default_ua = ua
            break
    splash_up, message = lookyloo.splash_status()
    splash_up, message = splash_status()
    if not splash_up:
        flash(f'The capture module is not reachable ({message}).', 'error')
        flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
@@ -12,7 +12,8 @@ from werkzeug.security import check_password_hash

from lookyloo.lookyloo import Lookyloo

from .helpers import src_request_ip, load_user_from_request, build_users_table
from .helpers import (src_request_ip, load_user_from_request, build_users_table)
from lookyloo.helpers import splash_status, get_capture_status

api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')

@@ -61,7 +62,7 @@ class AuthToken(Resource):
@api.doc(description='Get status of splash.')
class SplashStatus(Resource):
    def get(self):
        status, info = lookyloo.splash_status()
        status, info = splash_status()
        return {'is_up': status, 'info': info}

@@ -70,7 +71,7 @@ class SplashStatus(Resource):
         params={'capture_uuid': 'The UUID of the capture'})
class CaptureStatusQuery(Resource):
    def get(self, capture_uuid: str):
        return {'status_code': lookyloo.get_capture_status(capture_uuid)}
        return {'status_code': get_capture_status(capture_uuid)}


@api.route('/json/<string:capture_uuid>/hostnames')