mirror of https://github.com/CIRCL/lookyloo
chg: Major refactoring, move capture code to external script.
parent d8416f0f47
commit bf700e7a7b
@@ -1,10 +1,27 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
+import base64
+import ipaddress
+import json
 import logging
+import socket
 
+from io import BufferedIOBase
+from datetime import datetime
+from pathlib import Path
+from typing import Union, Dict, Optional, Tuple, List
+from urllib.parse import urlsplit
+from uuid import uuid4
+
+from defang import refang  # type: ignore
+from redis import Redis
+from scrapysplashwrapper import crawl
+
 from lookyloo.abstractmanager import AbstractManager
-from lookyloo.helpers import shutdown_requested
+from lookyloo.helpers import (shutdown_requested, splash_status, get_socket_path,
+                              load_cookies, safe_create_dir, get_config, get_splash_url,
+                              get_captures_dir)
 from lookyloo.lookyloo import Lookyloo
 
 logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
@@ -17,10 +34,159 @@ class AsyncCapture(AbstractManager):
         super().__init__(loglevel)
         self.lookyloo = Lookyloo()
         self.script_name = 'async_capture'
+        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
+        self.capture_dir: Path = get_captures_dir()
+        self.splash_url: str = get_splash_url()
+        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
+
+    def process_capture_queue(self) -> Union[bool, None]:
+        '''Process a query from the capture queue'''
+        if not self.redis.exists('to_capture'):
+            return None
+
+        status, message = splash_status()
+        if not status:
+            self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
+            return None
+
+        value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture')  # type: ignore
+        if not value or not value[0]:
+            return None
+        uuid, score = value[0]
+        queue: Optional[str] = self.redis.get(f'{uuid}_mgmt')
+        self.redis.sadd('ongoing', uuid)
+
+        lazy_cleanup = self.redis.pipeline()
+        lazy_cleanup.delete(f'{uuid}_mgmt')
+        if queue:
+            # queue shouldn't be none, but if it is, just ignore.
+            lazy_cleanup.zincrby('queues', -1, queue)
+
+        to_capture: Dict[str, str] = self.redis.hgetall(uuid)
+        to_capture['perma_uuid'] = uuid
+        if 'cookies' in to_capture:
+            to_capture['cookies_pseudofile'] = to_capture.pop('cookies')
+
+        status = self._capture(**to_capture)  # type: ignore
+        lazy_cleanup.srem('ongoing', uuid)
+        lazy_cleanup.delete(uuid)
+        # make sure to expire the key if nothing was process for a while (= queues empty)
+        lazy_cleanup.expire('queues', 600)
+        lazy_cleanup.execute()
+        if status:
+            self.logger.info(f'Processed {to_capture["url"]}')
+            return True
+        self.logger.warning(f'Unable to capture {to_capture["url"]}')
+        return False
+
+    def _capture(self, url: str, *, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
+                 referer: str='', proxy: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
+                 browser: Optional[str]=None, parent: Optional[str]=None) -> Union[bool, str]:
+        '''Launch a capture'''
+        url = url.strip()
+        url = refang(url)
+        if not url.startswith('http'):
+            url = f'http://{url}'
+        if self.only_global_lookups:
+            splitted_url = urlsplit(url)
+            if splitted_url.netloc:
+                if splitted_url.hostname:
+                    if splitted_url.hostname.split('.')[-1] != 'onion':
+                        try:
+                            ip = socket.gethostbyname(splitted_url.hostname)
+                        except socket.gaierror:
+                            self.logger.info('Name or service not known')
+                            return False
+                        if not ipaddress.ip_address(ip).is_global:
+                            return False
+            else:
+                return False
+
+        cookies = load_cookies(cookies_pseudofile)
+        if not user_agent:
+            # Catch case where the UA is broken on the UI, and the async submission.
+            ua: str = get_config('generic', 'default_user_agent')
+        else:
+            ua = user_agent
+
+        if int(depth) > int(get_config('generic', 'max_depth')):
+            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
+            depth = int(get_config('generic', 'max_depth'))
+        if not perma_uuid:
+            perma_uuid = str(uuid4())
+        self.logger.info(f'Capturing {url}')
+        try:
+            items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
+                          referer=referer, proxy=proxy, log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
+        except Exception as e:
+            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
+            raise e
+        if not items:
+            # broken
+            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
+            return False
+        width = len(str(len(items)))
+        dirpath = self.capture_dir / datetime.now().isoformat()
+        safe_create_dir(dirpath)
+
+        if os or browser:
+            meta = {}
+            if os:
+                meta['os'] = os
+            if browser:
+                meta['browser'] = browser
+            with (dirpath / 'meta').open('w') as _meta:
+                json.dump(meta, _meta)
+
+        # Write UUID
+        with (dirpath / 'uuid').open('w') as _uuid:
+            _uuid.write(perma_uuid)
+
+        # Write no_index marker (optional)
+        if not listing:
+            (dirpath / 'no_index').touch()
+
+        # Write parent UUID (optional)
+        if parent:
+            with (dirpath / 'parent').open('w') as _parent:
+                _parent.write(parent)
+
+        for i, item in enumerate(items):
+            if 'error' in item:
+                with (dirpath / 'error.txt').open('w') as _error:
+                    json.dump(item['error'], _error)
+
+            # The capture went fine
+            harfile = item['har']
+            png = base64.b64decode(item['png'])
+            html = item['html']
+            last_redirect = item['last_redirected_url']
+
+            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
+                json.dump(harfile, _har)
+            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
+                _img.write(png)
+            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
+                _html.write(html)
+            with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
+                _redir.write(last_redirect)
+
+            if 'childFrames' in item:
+                child_frames = item['childFrames']
+                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
+                    json.dump(child_frames, _iframes)
+
+            if 'cookies' in item:
+                cookies = item['cookies']
+                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
+                    json.dump(cookies, _cookies)
+        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
+        return perma_uuid
+
     def _to_run_forever(self):
         while True:
-            url = self.lookyloo.process_capture_queue()
+            url = self.process_capture_queue()
             if url is None or shutdown_requested():
                 break
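
Editor's note: the daemon above consumes a small Redis contract that the web side fills in `enqueue_capture` (see the Lookyloo hunks below): a `to_capture` sorted set scored by priority, a per-UUID hash holding the query, a `{uuid}_mgmt` string naming the submitting queue, and a `queues` counter. A minimal producer sketch illustrating that contract, not part of the commit; the socket path and query values are assumptions:

# Illustrative producer for the queue consumed by AsyncCapture.process_capture_queue().
# Key names come from the diff; the socket path and query values are invented.
from uuid import uuid4
from redis import Redis

r = Redis(unix_socket_path='cache.sock', decode_responses=True)  # hypothetical socket path
uuid = str(uuid4())
r.hset(uuid, mapping={'url': 'http://example.com', 'listing': 1, 'depth': 1})  # the capture query
r.set(f'{uuid}_mgmt', 'web|False|anonymous')   # source|authenticated|user, as written by enqueue_capture
r.zincrby('queues', 1, 'web|False|anonymous')  # per-queue counter, decremented again by the daemon
r.zadd('to_capture', {uuid: 10})               # priority score; the highest entry is popped first (zpopmax)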
@@ -22,6 +22,9 @@ def main():
    print('Start background indexer...')
    Popen(['background_indexer'])
    print('done.')
+   print('Start background processing...')
+   Popen(['processing'])
+   print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')
@@ -7,7 +7,8 @@ import json
 import traceback
 import pickle
 import pkg_resources
-from typing import List, Optional, Dict, Union, Any, Set
+from typing import List, Optional, Dict, Union, Any, Set, Tuple
+from urllib.parse import urljoin
 from io import BufferedIOBase
 from pathlib import Path
 from datetime import datetime, timedelta
@@ -19,6 +20,8 @@ from enum import IntEnum, unique
 from har2tree import CrawledTree, HostNode, URLNode
 from redis import Redis
 from redis.exceptions import ConnectionError
+import requests
+from requests.exceptions import HTTPError
 from publicsuffix2 import PublicSuffixList, fetch  # type: ignore
 from bs4 import BeautifulSoup  # type: ignore
 from pytaxonomies import Taxonomies
@@ -377,3 +380,38 @@ def reload_uuids_index() -> None:
     p.delete('lookup_dirs')
     p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
     p.execute()
+
+
+def get_capture_status(capture_uuid: str, /) -> CaptureStatus:
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    if r.zrank('to_capture', capture_uuid) is not None:
+        return CaptureStatus.QUEUED
+    elif r.hexists('lookup_dirs', capture_uuid):
+        return CaptureStatus.DONE
+    elif r.sismember('ongoing', capture_uuid):
+        return CaptureStatus.ONGOING
+    return CaptureStatus.UNKNOWN
+
+
+@lru_cache(64)
+def get_splash_url() -> str:
+    if os.environ.get('SPLASH_URL_DOCKER'):
+        # In order to have a working default for the docker image, it is easier to use an environment variable
+        return os.environ['SPLASH_URL_DOCKER']
+    else:
+        return get_config('generic', 'splash_url')
+
+
+def splash_status() -> Tuple[bool, str]:
+    try:
+        splash_status = requests.get(urljoin(get_splash_url(), '_ping'))
+        splash_status.raise_for_status()
+        json_status = splash_status.json()
+        if json_status['status'] == 'ok':
+            return True, 'Splash is up'
+        else:
+            return False, str(json_status)
+    except HTTPError as http_err:
+        return False, f'HTTP error occurred: {http_err}'
+    except Exception as err:
+        return False, f'Other error occurred: {err}'
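
Editor's note: these helpers are now importable without instantiating Lookyloo. A rough sketch of a caller checking Splash and then polling a capture until it leaves the queue, using only names added or referenced in this commit (the sleep interval and the UUID are placeholders):

# Polling sketch built on the new module-level helpers (not part of the commit).
import time

from lookyloo.helpers import CaptureStatus, get_capture_status, splash_status

up, message = splash_status()
if not up:
    raise SystemExit(f'Splash is unreachable: {message}')

capture_uuid = '...'  # UUID returned by Lookyloo.enqueue_capture()
while get_capture_status(capture_uuid) in (CaptureStatus.QUEUED, CaptureStatus.ONGOING):
    time.sleep(5)  # arbitrary interval
print(get_capture_status(capture_uuid))  # ideally CaptureStatus.DONE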
@@ -1,46 +1,37 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-import os
 import base64
 from collections import defaultdict
 from datetime import datetime, date
 from email.message import EmailMessage
-from io import BufferedIOBase, BytesIO
-import ipaddress
+from io import BytesIO
 import json
 import logging
 from pathlib import Path
 import pickle
 import smtplib
-import socket
 import sys
 from typing import Union, Dict, List, Tuple, Optional, Any, MutableMapping, Set, Iterable
-from urllib.parse import urlsplit, urljoin
 from uuid import uuid4
 from zipfile import ZipFile
 import operator
 import time
 
-from defang import refang  # type: ignore
 import dns.resolver
 import dns.rdatatype
 from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
 from PIL import Image  # type: ignore
 from pymisp import MISPEvent, MISPAttribute, MISPObject
 from pymisp.tools import URLObject, FileObject
-import requests
-from requests.exceptions import HTTPError
 from redis import Redis, ConnectionPool
 from redis.connection import UnixDomainSocketConnection
-from scrapysplashwrapper import crawl
 from werkzeug.useragents import UserAgent
 
 from .exceptions import NoValidHarFile, MissingUUID, LookylooException, MissingCaptureDirectory
-from .helpers import (get_homedir, get_socket_path, load_cookies, get_config,
-                      safe_create_dir, get_email_template, load_pickle_tree,
+from .helpers import (get_homedir, get_socket_path, get_config, get_email_template, load_pickle_tree,
                       remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains,
-                      CaptureStatus, try_make_file, get_captures_dir)
+                      try_make_file, get_captures_dir, get_splash_url)
 from .modules import VirusTotal, SaneJavaScript, PhishingInitiative, MISP, UniversalWhois, UrlScan
 from .capturecache import CaptureCache
 from .context import Context
@@ -60,12 +51,7 @@ class Lookyloo():
         self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                          path=get_socket_path('cache'), decode_responses=True)
         self.capture_dir: Path = get_captures_dir()
-        if os.environ.get('SPLASH_URL_DOCKER'):
-            # In order to have a working default for the docker image, it is easier to use an environment variable
-            self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
-        else:
-            self.splash_url = get_config('generic', 'splash_url')
-        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
+        self.splash_url: str = get_splash_url()
 
         self._priority = get_config('generic', 'priority')
@@ -101,11 +87,6 @@ class Lookyloo():
     def redis(self):
         return Redis(connection_pool=self.redis_pool)
 
-    def cache_user_agents(self, user_agent: str, remote_ip: str) -> None:
-        '''Cache the useragents of the visitors'''
-        today = date.today().isoformat()
-        self.redis.zincrby(f'user_agents|{today}', 1, f'{remote_ip}|{user_agent}')
-
     def _get_capture_dir(self, capture_uuid: str, /) -> Path:
         '''Use the cache to get a capture directory from a capture UUID'''
         capture_dir: Optional[Union[str, Path]]
@@ -135,6 +116,31 @@ class Lookyloo():
     def _cache_capture(self, capture_uuid: str, /) -> CrawledTree:
         '''Generate the pickle, set the cache, add capture in the indexes'''
+
+        def _ensure_meta(capture_dir: Path, tree: CrawledTree) -> None:
+            '''Make sure the meta file is present, it contains information about the User Agent used for the capture.'''
+            metafile = capture_dir / 'meta'
+            if metafile.exists():
+                return
+            ua = UserAgent(tree.root_hartree.user_agent)
+            to_dump = {}
+            if ua.platform:
+                to_dump['os'] = ua.platform
+            if ua.browser:
+                if ua.version:
+                    to_dump['browser'] = f'{ua.browser} {ua.version}'
+                else:
+                    to_dump['browser'] = ua.browser
+            if ua.language:
+                to_dump['language'] = ua.language
+
+            if not to_dump:
+                # UA not recognized
+                self.logger.info(f'Unable to recognize the User agent: {ua}')
+            to_dump['user_agent'] = ua.string
+            with metafile.open('w') as f:
+                json.dump(to_dump, f)
+
         capture_dir = self._get_capture_dir(capture_uuid)
         har_files = sorted(capture_dir.glob('*.har'))
         lock_file = capture_dir / 'lock'
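
Editor's note: the `meta` file this helper writes is a small JSON document. A plausible result for a desktop Chrome User-Agent is sketched below; the keys are the ones the function can set, but the concrete values depend on werkzeug's parser and are purely illustrative:

# Hypothetical content of <capture_dir>/meta after _ensure_meta() runs (values invented).
example_meta = {
    'os': 'windows',
    'browser': 'chrome 90.0.4430.93',
    'language': 'en-US',
    'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',
}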
@@ -160,7 +166,7 @@ class Lookyloo():
         index = True
         try:
             ct = CrawledTree(har_files, capture_uuid)
-            self._ensure_meta(capture_dir, ct)
+            _ensure_meta(capture_dir, ct)
             self._resolve_dns(ct)
             self.context.contextualize_tree(ct)
             cache = self.capture_cache(capture_uuid)
@@ -271,26 +277,27 @@ class Lookyloo():
             self._captures_index.pop(uuid, None)
         return cache
 
-    def _build_cname_chain(self, known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
-        '''Returns a list of CNAMEs starting from one hostname.
-        The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
-        and the CNAME entry can have an other CNAME entry, and so on multiple times.
-        This method loops over the hostnames until there are no CNAMES.'''
-        cnames: List[str] = []
-        to_search = hostname
-        while True:
-            if known_cnames.get(to_search) is None:
-                break
-            # At this point, known_cnames[to_search] must exist and be a str
-            cnames.append(known_cnames[to_search])  # type: ignore
-            to_search = known_cnames[to_search]
-        return cnames
-
     def _resolve_dns(self, ct: CrawledTree):
         '''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
         and store them in ips.json and cnames.json, in the capture directory.
         Updates the nodes of the tree accordingly so the information is available.
         '''
+
+        def _build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
+            '''Returns a list of CNAMEs starting from one hostname.
+            The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
+            and the CNAME entry can have an other CNAME entry, and so on multiple times.
+            This method loops over the hostnames until there are no CNAMES.'''
+            cnames: List[str] = []
+            to_search = hostname
+            while True:
+                if known_cnames.get(to_search) is None:
+                    break
+                # At this point, known_cnames[to_search] must exist and be a str
+                cnames.append(known_cnames[to_search])  # type: ignore
+                to_search = known_cnames[to_search]
+            return cnames
+
         cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
         ips_path = ct.root_hartree.har.path.parent / 'ips.json'
         host_cnames: Dict[str, Optional[str]] = {}
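
Editor's note: the chain walk is easiest to see on a tiny example. Below is a standalone sketch of the same logic with a made-up resolution map (the real map is filled from DNS answers in `_resolve_dns`; note the helper assumes the map contains no CNAME loops):

# Standalone copy of the chain walk for illustration, with fabricated entries.
from typing import Dict, List, Optional

known_cnames: Dict[str, Optional[str]] = {
    'www.example.org': 'cdn.example.net',    # invented entries
    'cdn.example.net': 'edge.provider.com',
    'edge.provider.com': None,               # no further CNAME
}

def build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname: str) -> List[str]:
    cnames: List[str] = []
    to_search = hostname
    while known_cnames.get(to_search) is not None:
        cnames.append(known_cnames[to_search])  # type: ignore
        to_search = known_cnames[to_search]
    return cnames

print(build_cname_chain(known_cnames, 'www.example.org'))
# ['cdn.example.net', 'edge.provider.com']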
@@ -319,7 +326,7 @@ class Lookyloo():
             except Exception:
                 host_cnames[node.name] = None
                 host_ips[node.name] = []
-            cnames = self._build_cname_chain(host_cnames, node.name)
+            cnames = _build_cname_chain(host_cnames, node.name)
             if cnames:
                 node.add_feature('cname', cnames)
                 if cnames[-1] in host_ips:
@@ -494,24 +501,6 @@ class Lookyloo():
             to_return['urlscan']['result'] = result
         return to_return
 
-    def get_misp_occurrences(self, capture_uuid: str, /) -> Optional[Dict[str, Set[str]]]:
-        if not self.misp.available:
-            return None
-        try:
-            ct = self.get_crawled_tree(capture_uuid)
-        except LookylooException:
-            self.logger.warning(f'Unable to get the modules responses unless the tree ({capture_uuid}) is cached.')
-            return None
-        nodes_to_lookup = ct.root_hartree.rendered_node.get_ancestors() + [ct.root_hartree.rendered_node]
-        to_return: Dict[str, Set[str]] = defaultdict(set)
-        for node in nodes_to_lookup:
-            hits = self.misp.lookup(node, ct.root_hartree.get_host_node_by_uuid(node.hostnode_uuid))
-            for event_id, values in hits.items():
-                if not isinstance(values, set):
-                    continue
-                to_return[event_id].update(values)
-        return to_return
-
     def hide_capture(self, capture_uuid: str, /) -> None:
         """Add the capture in the hidden pool (not shown on the front page)
         NOTE: it won't remove the correlations until they are rebuilt.
@@ -594,31 +583,23 @@ class Lookyloo():
             raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
         return ct
 
-    def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus:
-        redis = self.redis  # use a single connection
-        if redis.zrank('to_capture', capture_uuid) is not None:
-            return CaptureStatus.QUEUED
-        elif redis.hexists('lookup_dirs', capture_uuid):
-            return CaptureStatus.DONE
-        elif redis.sismember('ongoing', capture_uuid):
-            return CaptureStatus.ONGOING
-        return CaptureStatus.UNKNOWN
-
-    def _get_priority(self, source: str, user: str, authenticated: bool) -> int:
-        src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1
-        if not authenticated:
-            usr_prio = self._priority['users']['_default_anon']
-            # reduce priority for anonymous users making lots of captures
-            queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}')
-            if queue_size is None:
-                queue_size = 0
-            usr_prio -= int(queue_size / 10)
-        else:
-            usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth']
-        return src_prio + usr_prio
-
     def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
         '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
+
+        def _get_priority(source: str, user: str, authenticated: bool) -> int:
+            src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1
+            if not authenticated:
+                usr_prio = self._priority['users']['_default_anon']
+                # reduce priority for anonymous users making lots of captures
+                queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}')
+                if queue_size is None:
+                    queue_size = 0
+                usr_prio -= int(queue_size / 10)
+            else:
+                usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth']
+            return src_prio + usr_prio
+
+        priority = _get_priority(source, user, authenticated)
         perma_uuid = str(uuid4())
         p = self.redis.pipeline()
         for key, value in query.items():
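
Editor's note: the score is simply source priority plus user priority, with anonymous users penalised by one point per ten captures already sitting in their queue. A worked example with invented config values (the shipped `priority` config may differ):

# Invented priority config for illustration only.
priority = {
    'sources': {'web': 10, 'api': 0},
    'users': {'_default_anon': 5, '_default_auth': 10},
}

# Anonymous web submission with 42 captures already queued for that client:
src_prio = priority['sources']['web']          # 10
usr_prio = priority['users']['_default_anon']  # 5
usr_prio -= int(42 / 10)                       # 5 - 4 = 1
print(src_prio + usr_prio)                     # 11

When the total drops below -10, the next hunk additionally sets query['listing'] = 0, so abusive bulk submissions are kept off the public index.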
@@ -627,53 +608,16 @@ class Lookyloo():
                 query[key] = 1 if value else ''
             if isinstance(value, list):
                 query[key] = json.dumps(value)
+        if priority < -10:
+            # Someone is probably abusing the system with useless URLs, remove them from the index
+            query['listing'] = 0
         p.hmset(perma_uuid, query)
-        priority = self._get_priority(source, user, authenticated)
         p.zadd('to_capture', {perma_uuid: priority})
         p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
         p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
         p.execute()
         return perma_uuid
 
-    def process_capture_queue(self) -> Union[bool, None]:
-        '''Process a query from the capture queue'''
-        redis = self.redis  # use a single connection
-        if not redis.exists('to_capture'):
-            return None
-
-        status, message = self.splash_status()
-        if not status:
-            self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
-            return None
-
-        value = redis.zpopmax('to_capture')
-        if not value or not value[0]:
-            return None
-        uuid, score = value[0]
-        queue: str = redis.get(f'{uuid}_mgmt')
-        redis.sadd('ongoing', uuid)
-
-        lazy_cleanup = redis.pipeline()
-        lazy_cleanup.delete(f'{uuid}_mgmt')
-        lazy_cleanup.zincrby('queues', -1, queue)
-
-        to_capture: Dict[str, Union[str, int, float]] = redis.hgetall(uuid)
-        to_capture['perma_uuid'] = uuid
-        if 'cookies' in to_capture:
-            to_capture['cookies_pseudofile'] = to_capture.pop('cookies')
-
-        status = self._capture(**to_capture)  # type: ignore
-        lazy_cleanup.srem('ongoing', uuid)
-        lazy_cleanup.delete(uuid)
-        # make sure to expire the key if nothing was process for a while (= queues empty)
-        lazy_cleanup.expire('queues', 600)
-        lazy_cleanup.execute()
-        if status:
-            self.logger.info(f'Processed {to_capture["url"]}')
-            return True
-        self.logger.warning(f'Unable to capture {to_capture["url"]}')
-        return False
-
     def send_mail(self, capture_uuid: str, /, email: str='', comment: str='') -> None:
         '''Send an email notification regarding a specific capture'''
         if not get_config('generic', 'enable_mail_notification'):
@@ -716,30 +660,6 @@ class Lookyloo():
                 self.logger.exception(e)
                 self.logger.warning(msg.as_string())
 
-    def _ensure_meta(self, capture_dir: Path, tree: CrawledTree) -> None:
-        '''Make sure the meta file is present, it contains information about the User Agent used for the capture.'''
-        metafile = capture_dir / 'meta'
-        if metafile.exists():
-            return
-        ua = UserAgent(tree.root_hartree.user_agent)
-        to_dump = {}
-        if ua.platform:
-            to_dump['os'] = ua.platform
-        if ua.browser:
-            if ua.version:
-                to_dump['browser'] = f'{ua.browser} {ua.version}'
-            else:
-                to_dump['browser'] = ua.browser
-        if ua.language:
-            to_dump['language'] = ua.language
-
-        if not to_dump:
-            # UA not recognized
-            self.logger.info(f'Unable to recognize the User agent: {ua}')
-        to_dump['user_agent'] = ua.string
-        with metafile.open('w') as f:
-            json.dump(to_dump, f)
-
     def _get_raw(self, capture_uuid: str, /, extension: str='*', all_files: bool=True) -> BytesIO:
         '''Get file(s) from the capture directory'''
         try:
@@ -806,125 +726,6 @@ class Lookyloo():
         return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
                       - set(ct.root_hartree.all_url_requests.keys()))
 
-    def splash_status(self) -> Tuple[bool, str]:
-        try:
-            splash_status = requests.get(urljoin(self.splash_url, '_ping'))
-            splash_status.raise_for_status()
-            json_status = splash_status.json()
-            if json_status['status'] == 'ok':
-                return True, 'Splash is up'
-            else:
-                return False, str(json_status)
-        except HTTPError as http_err:
-            return False, f'HTTP error occurred: {http_err}'
-        except Exception as err:
-            return False, f'Other error occurred: {err}'
-
-    def _capture(self, url: str, *, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
-                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
-                 referer: str='', proxy: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
-                 browser: Optional[str]=None, parent: Optional[str]=None) -> Union[bool, str]:
-        '''Launch a capture'''
-        url = url.strip()
-        url = refang(url)
-        if not url.startswith('http'):
-            url = f'http://{url}'
-        if self.only_global_lookups:
-            splitted_url = urlsplit(url)
-            if splitted_url.netloc:
-                if splitted_url.hostname:
-                    if splitted_url.hostname.split('.')[-1] != 'onion':
-                        try:
-                            ip = socket.gethostbyname(splitted_url.hostname)
-                        except socket.gaierror:
-                            self.logger.info('Name or service not known')
-                            return False
-                        if not ipaddress.ip_address(ip).is_global:
-                            return False
-            else:
-                return False
-
-        cookies = load_cookies(cookies_pseudofile)
-        if not user_agent:
-            # Catch case where the UA is broken on the UI, and the async submission.
-            ua: str = get_config('generic', 'default_user_agent')
-        else:
-            ua = user_agent
-
-        if int(depth) > int(get_config('generic', 'max_depth')):
-            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
-            depth = int(get_config('generic', 'max_depth'))
-        if not perma_uuid:
-            perma_uuid = str(uuid4())
-        self.logger.info(f'Capturing {url}')
-        try:
-            items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
-                          referer=referer, proxy=proxy, log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
-        except Exception as e:
-            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
-            raise e
-        if not items:
-            # broken
-            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
-            return False
-        width = len(str(len(items)))
-        dirpath = self.capture_dir / datetime.now().isoformat()
-        safe_create_dir(dirpath)
-
-        if os or browser:
-            meta = {}
-            if os:
-                meta['os'] = os
-            if browser:
-                meta['browser'] = browser
-            with (dirpath / 'meta').open('w') as _meta:
-                json.dump(meta, _meta)
-
-        # Write UUID
-        with (dirpath / 'uuid').open('w') as _uuid:
-            _uuid.write(perma_uuid)
-
-        # Write no_index marker (optional)
-        if not listing:
-            (dirpath / 'no_index').touch()
-
-        # Write parent UUID (optional)
-        if parent:
-            with (dirpath / 'parent').open('w') as _parent:
-                _parent.write(parent)
-
-        for i, item in enumerate(items):
-            if 'error' in item:
-                with (dirpath / 'error.txt').open('w') as _error:
-                    json.dump(item['error'], _error)
-
-            # The capture went fine
-            harfile = item['har']
-            png = base64.b64decode(item['png'])
-            html = item['html']
-            last_redirect = item['last_redirected_url']
-
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
-                json.dump(harfile, _har)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
-                _img.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
-                _html.write(html)
-            with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
-                _redir.write(last_redirect)
-
-            if 'childFrames' in item:
-                child_frames = item['childFrames']
-                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
-                    json.dump(child_frames, _iframes)
-
-            if 'cookies' in item:
-                cookies = item['cookies']
-                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
-                    json.dump(cookies, _cookies)
-        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
-        return perma_uuid
-
     def get_body_hash_investigator(self, body_hash: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:
         '''Returns all the captures related to a hash (sha512), used in the web interface.'''
         total_captures, details = self.indexing.get_body_hash_captures(body_hash, limit=-1)
@@ -1033,26 +834,6 @@ class Lookyloo():
                 captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
         return total_captures, captures_list
 
-    def _normalize_known_content(self, h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
-        ''' There are a few different sources to figure out known vs. legitimate content,
-        this method normalize it for the web interface.'''
-        known: Optional[Union[str, List[Any]]] = None
-        legitimate: Optional[Tuple[bool, Any]] = None
-        if h not in known_content:
-            return known, legitimate
-
-        if known_content[h]['type'] in ['generic', 'sanejs']:
-            known = known_content[h]['details']
-        elif known_content[h]['type'] == 'legitimate_on_domain':
-            legit = False
-            if url.hostname in known_content[h]['details']:
-                legit = True
-            legitimate = (legit, known_content[h]['details'])
-        elif known_content[h]['type'] == 'malicious':
-            legitimate = (False, known_content[h]['details'])
-
-        return known, legitimate
-
     def get_ressource(self, tree_uuid: str, /, urlnode_uuid: str, h: Optional[str]) -> Optional[Tuple[str, BytesIO, str]]:
         '''Get a specific resource from a URL node. If a hash s also given, we want an embeded resource'''
         try:
@@ -1178,6 +959,24 @@ class Lookyloo():
 
         return [event]
 
+    def get_misp_occurrences(self, capture_uuid: str, /) -> Optional[Dict[str, Set[str]]]:
+        if not self.misp.available:
+            return None
+        try:
+            ct = self.get_crawled_tree(capture_uuid)
+        except LookylooException:
+            self.logger.warning(f'Unable to get the modules responses unless the tree ({capture_uuid}) is cached.')
+            return None
+        nodes_to_lookup = ct.root_hartree.rendered_node.get_ancestors() + [ct.root_hartree.rendered_node]
+        to_return: Dict[str, Set[str]] = defaultdict(set)
+        for node in nodes_to_lookup:
+            hits = self.misp.lookup(node, ct.root_hartree.get_host_node_by_uuid(node.hostnode_uuid))
+            for event_id, values in hits.items():
+                if not isinstance(values, set):
+                    continue
+                to_return[event_id].update(values)
+        return to_return
+
     def get_hashes(self, tree_uuid: str, /, hostnode_uuid: Optional[str]=None, urlnode_uuid: Optional[str]=None) -> Set[str]:
         """Return hashes of resources.
         Only tree_uuid: All the hashes
@@ -1227,6 +1026,27 @@ class Lookyloo():
 
     def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
         '''Gather all the informations needed to display the Hostnode investigator popup.'''
+
+        def _normalize_known_content(h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
+            ''' There are a few different sources to figure out known vs. legitimate content,
+            this method normalize it for the web interface.'''
+            known: Optional[Union[str, List[Any]]] = None
+            legitimate: Optional[Tuple[bool, Any]] = None
+            if h not in known_content:
+                return known, legitimate
+
+            if known_content[h]['type'] in ['generic', 'sanejs']:
+                known = known_content[h]['details']
+            elif known_content[h]['type'] == 'legitimate_on_domain':
+                legit = False
+                if url.hostname in known_content[h]['details']:
+                    legit = True
+                legitimate = (legit, known_content[h]['details'])
+            elif known_content[h]['type'] == 'malicious':
+                legitimate = (False, known_content[h]['details'])
+
+            return known, legitimate
+
         ct = self.get_crawled_tree(capture_uuid)
         hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
         if not hostnode:
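
Editor's note: the `known_content` mapping this helper consumes is keyed by resource hash, with a `type` field selecting the branch. Invented entries of each shape, purely to make the branches concrete (the real `details` payloads come from the SaneJS, context and MISP modules and may look different):

# Fabricated known_content entries covering the branches handled above.
known_content = {
    'sha512-of-a-library': {'type': 'sanejs', 'details': ['an entry describing a known library build']},
    'sha512-of-a-logo': {'type': 'legitimate_on_domain', 'details': ['example.com']},
    'sha512-of-a-kit': {'type': 'malicious', 'details': 'flagged by a module'},
}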
@@ -1270,13 +1090,13 @@ class Lookyloo():
                     if freq_embedded['hash_freq'] > 1:
                         to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid)
                 for h in to_append['embedded_ressources'].keys():
-                    known, legitimate = self._normalize_known_content(h, known_content, url)
+                    known, legitimate = _normalize_known_content(h, known_content, url)
                     if known:
                         to_append['embedded_ressources'][h]['known_content'] = known
                     elif legitimate:
                         to_append['embedded_ressources'][h]['legitimacy'] = legitimate
 
-            known, legitimate = self._normalize_known_content(url.body_hash, known_content, url)
+            known, legitimate = _normalize_known_content(url.body_hash, known_content, url)
             if known:
                 to_append['known_content'] = known
             elif legitimate:
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 from io import BytesIO, StringIO
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timedelta, timezone, date
 import json
 import http
 import calendar
@@ -21,13 +21,14 @@ from werkzeug.security import check_password_hash
 
 from pymisp import MISPEvent, MISPServerError
 
-from lookyloo.helpers import (get_user_agents, get_config,
-                              get_taxonomies, load_cookies, CaptureStatus)
+from lookyloo.helpers import (get_user_agents, get_config, get_taxonomies, load_cookies,
+                              CaptureStatus, splash_status, get_capture_status)
 from lookyloo.lookyloo import Lookyloo, Indexing
 from lookyloo.exceptions import NoValidHarFile, MissingUUID
 
 from .proxied import ReverseProxied
-from .helpers import src_request_ip, User, load_user_from_request, build_users_table, get_secret_key, sri_load
+from .helpers import (src_request_ip, User, load_user_from_request, build_users_table,
+                      get_secret_key, sri_load)
 
 app: Flask = Flask(__name__)
 app.wsgi_app = ReverseProxied(app.wsgi_app)  # type: ignore
@@ -157,18 +158,20 @@ app.jinja_env.globals.update(get_sri=get_sri)
 
 @app.after_request
 def after_request(response):
-    # We keep a list user agents in order to build a list to use in the capture
-    # interface: this is the easiest way to have something up to date.
-    # The reason we also get the IP address of the client is because we
-    # count the frequency of each user agents and use it to sort them on the
-    # capture page, and we want to avoid counting the same user (same IP)
-    # multiple times in a day.
-    # The cache of IPs is deleted after the UA file is generated (see lookyloo.build_ua_file),
-    # once a day.
-    ua = request.headers.get('User-Agent')
-    real_ip = src_request_ip(request)
-    if ua:
-        lookyloo.cache_user_agents(ua, real_ip)
+    if use_own_ua:
+        # We keep a list user agents in order to build a list to use in the capture
+        # interface: this is the easiest way to have something up to date.
+        # The reason we also get the IP address of the client is because we
+        # count the frequency of each user agents and use it to sort them on the
+        # capture page, and we want to avoid counting the same user (same IP)
+        # multiple times in a day.
+        # The cache of IPs is deleted after the UA file is generated once a day.
+        # See bin/background_processing.py
+        ua = request.headers.get('User-Agent')
+        real_ip = src_request_ip(request)
+        if ua:
+            today = date.today().isoformat()
+            lookyloo.redis.zincrby(f'user_agents|{today}', 1, f'{real_ip}|{ua}')
     # Opt out of FLoC
     response.headers.set('Permissions-Policy', 'interest-cohort=()')
     return response
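
Editor's note: visitor User-Agents now land directly in a per-day Redis sorted set named `user_agents|<ISO date>`, scored by frequency; the `# See bin/background_processing.py` comment and the new `processing` entry point in the start-script hunk above suggest a background job turns that set into the UA list once a day. A sketch of reading today's set back, assuming the same cache Redis database:

# Illustrative reader for the per-day UA sorted set written in after_request().
from datetime import date
from redis import Redis

r = Redis(unix_socket_path='cache.sock', decode_responses=True)  # hypothetical socket path
today = date.today().isoformat()
for entry, freq in r.zrevrange(f'user_agents|{today}', 0, 9, withscores=True):
    ip, ua = entry.split('|', 1)  # entries are stored as 'ip|user-agent'
    print(int(freq), ua)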
@@ -554,8 +557,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
    try:
        cache = lookyloo.capture_cache(tree_uuid)
    except MissingUUID:
-        status = lookyloo.get_capture_status(tree_uuid)
-        splash_up, splash_message = lookyloo.splash_status()
+        status = get_capture_status(tree_uuid)
+        splash_up, splash_message = splash_status()
        if not splash_up:
            flash(f'The capture module is not reachable ({splash_message}).', 'error')
            flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
@@ -809,7 +812,7 @@ def capture_web():
            if 'bot' not in ua['useragent'].lower():
                default_ua = ua
                break
-    splash_up, message = lookyloo.splash_status()
+    splash_up, message = splash_status()
    if not splash_up:
        flash(f'The capture module is not reachable ({message}).', 'error')
        flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
@@ -12,7 +12,8 @@ from werkzeug.security import check_password_hash
 
 from lookyloo.lookyloo import Lookyloo
 
-from .helpers import src_request_ip, load_user_from_request, build_users_table
+from .helpers import (src_request_ip, load_user_from_request, build_users_table)
+from lookyloo.helpers import splash_status, get_capture_status
 
 api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/')
 
@@ -61,7 +62,7 @@ class AuthToken(Resource):
 @api.doc(description='Get status of splash.')
 class SplashStatus(Resource):
     def get(self):
-        status, info = lookyloo.splash_status()
+        status, info = splash_status()
         return {'is_up': status, 'info': info}
 
 
@@ -70,7 +71,7 @@ class SplashStatus(Resource):
          params={'capture_uuid': 'The UUID of the capture'})
 class CaptureStatusQuery(Resource):
     def get(self, capture_uuid: str):
-        return {'status_code': lookyloo.get_capture_status(capture_uuid)}
+        return {'status_code': get_capture_status(capture_uuid)}
 
 
 @api.route('/json/<string:capture_uuid>/hostnames')
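
Editor's note: from a client's point of view nothing changes here, the endpoint still returns the numeric CaptureStatus value. A hedged usage sketch, assuming the route registered for CaptureStatusQuery is `/json/<capture_uuid>/status` (the route decorator sits outside this hunk, so the path is an assumption):

# Hypothetical client-side check of the capture status endpoint.
import requests

instance = 'https://lookyloo.example.org'  # assumed instance URL
capture_uuid = '...'                       # UUID returned when the capture was submitted
resp = requests.get(f'{instance}/json/{capture_uuid}/status')  # route path assumed
print(resp.json()['status_code'])          # numeric CaptureStatus value; compare against CaptureStatus.DONE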
Loading…
Reference in New Issue