diff --git a/bin/start.py b/bin/start.py
index b4e37982..b2b71eb1 100755
--- a/bin/start.py
+++ b/bin/start.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 from subprocess import run, Popen
-from lookyloo.helpers import get_homedir
+from lookyloo.helpers import get_homedir, get_config
 
 
 def main():
@@ -13,7 +13,8 @@ def main():
     p.check_returncode()
     print('done.')
     print('Start asynchronous ingestor...')
-    Popen(['async_capture'])
+    for i in range(get_config('generic', 'async_capture_processes')):
+        Popen(['async_capture'])
     print('done.')
     print('Start background indexer...')
     Popen(['background_indexer'])
diff --git a/config/generic.json.sample b/config/generic.json.sample
index af2d568a..62bc477d 100644
--- a/config/generic.json.sample
+++ b/config/generic.json.sample
@@ -15,6 +15,7 @@
         "hours": 0
     },
     "max_depth": 1,
+    "async_capture_processes": 1,
     "use_user_agents_users": false,
     "enable_default_blur_screenshot": false,
     "enable_context_by_users": false,
@@ -42,6 +43,7 @@
         "users": "It is some kind of an admin accounts. Format: {username: password}",
         "time_delta_on_index": "Time interval of the capture displayed on the index",
         "max_depth": "Maximum depth for scraping. Anything > 1 will be exponentially bigger.",
+        "async_capture_processes": "Number of async_capture processes to start. This should not be higher than the number of splash instances you have running. A very high number will use *a lot* of ram.",
         "use_user_agents_users": "Only usable for medium/high use instances: use the user agents of the users of the platform",
         "enable_default_blur_screenshot": "If true, blur the screenshot by default (useful on public instances)",
         "enable_context_by_users": "Allow the users to add context to a response body",
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 20fc812d..c2184b10 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -16,7 +16,7 @@ import smtplib
 import socket
 import sys
 from typing import Union, Dict, List, Tuple, Optional, Any, MutableMapping, Set, Iterable
-from urllib.parse import urlsplit
+from urllib.parse import urlsplit, urljoin
 from uuid import uuid4
 from zipfile import ZipFile
 import operator
@@ -28,6 +28,8 @@ from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
 from PIL import Image  # type: ignore
 from pymisp import MISPEvent, MISPAttribute, MISPObject
 from pymisp.tools import URLObject, FileObject
+import requests
+from requests.exceptions import HTTPError
 from redis import Redis
 from scrapysplashwrapper import crawl
 from werkzeug.useragents import UserAgent
@@ -584,6 +586,14 @@ class Lookyloo():
 
     def process_capture_queue(self) -> Union[bool, None]:
         '''Process a query from the capture queue'''
+        if not self.redis.exists('to_capture'):
+            return None
+
+        status, message = self.splash_status()
+        if not status:
+            self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
+            return None
+
         uuid = self.redis.spop('to_capture')
         if not uuid:
             return None
@@ -735,6 +745,20 @@ class Lookyloo():
         return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
                       - set(ct.root_hartree.all_url_requests.keys()))
 
+    def splash_status(self) -> Tuple[bool, str]:
+        try:
+            splash_status = requests.get(urljoin(self.splash_url, '_ping'))
+            splash_status.raise_for_status()
+            json_status = splash_status.json()
+            if json_status['status'] == 'ok':
+                return True, 'Splash is up'
+            else:
+                return False, str(json_status)
+        except HTTPError as http_err:
+            return False, f'HTTP error occurred: {http_err}'
+        except Exception as err:
+            return False, f'Other error occurred: {err}'
+
     def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
                 referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
@@ -768,6 +792,8 @@ class Lookyloo():
         if int(depth) > int(get_config('generic', 'max_depth')):
             self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
             depth = int(get_config('generic', 'max_depth'))
+        if not perma_uuid:
+            perma_uuid = str(uuid4())
         self.logger.info(f'Capturing {url}')
         try:
             items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
@@ -779,8 +805,6 @@ class Lookyloo():
             # broken
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False
-        if not perma_uuid:
-            perma_uuid = str(uuid4())
         width = len(str(len(items)))
         dirpath = self.capture_dir / datetime.now().isoformat()
         safe_create_dir(dirpath)
diff --git a/poetry.lock b/poetry.lock
index f7a30145..a73fe84d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -188,7 +188,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
 [[package]]
 name = "decorator"
-version = "5.0.5"
+version = "5.0.6"
 description = "Decorators for Humans"
 category = "dev"
 optional = false
@@ -1314,8 +1314,8 @@ cssselect = [
     {file = "cssselect-1.1.0.tar.gz", hash = "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"},
 ]
 decorator = [
-    {file = "decorator-5.0.5-py3-none-any.whl", hash = "sha256:b7157d62ea3c2c0c57b81a05e4569853e976a3dda5dd7a1cb86be78978c3c5f8"},
-    {file = "decorator-5.0.5.tar.gz", hash = "sha256:acda948ffcfe4bd0c4a57834b74ad968b91925b8201b740ca9d46fb8c5c618ce"},
+    {file = "decorator-5.0.6-py3-none-any.whl", hash = "sha256:d9f2d2863183a3c0df05f4b786f2e6b8752c093b3547a558f287bf3022fd2bf4"},
+    {file = "decorator-5.0.6.tar.gz", hash = "sha256:f2e71efb39412bfd23d878e896a51b07744f2e2250b2e87d158e76828c5ae202"},
 ]
 defang = [
     {file = "defang-0.5.3.tar.gz", hash = "sha256:86aeff658d7cd4c3b61d16089872e1c1f0a1b7b3c64d4ca9525c017caeb284d7"},
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 23f88982..4468b544 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -13,6 +13,7 @@ from typing import Optional, Dict, Any, Union
 import logging
 import hashlib
 from urllib.parse import quote_plus, unquote_plus
+import time
 
 from flask import Flask, render_template, request, send_file, redirect, url_for, Response, flash, jsonify
 from flask_bootstrap import Bootstrap  # type: ignore
@@ -658,29 +659,34 @@ def search():
 @app.route('/capture', methods=['GET', 'POST'])
 def capture_web():
     if request.form.get('url'):
+        capture_query = {'url': request.form.get('url')}
         # check if the post request has the file part
         if 'cookies' in request.files and request.files['cookies'].filename:
-            cookie_file = request.files['cookies'].stream
-        else:
-            cookie_file = None
-        url = request.form.get('url')
+            capture_query['cookies'] = request.files['cookies'].stream.read()
+
         if request.form.get('personal_ua') and request.headers.get('User-Agent'):
-            user_agent = request.headers.get('User-Agent')
-            os = None
-            browser = None
+            capture_query['user_agent'] = request.headers.get('User-Agent')
         else:
-            user_agent = request.form.get('user_agent')
-            os = request.form.get('os')
-            browser = request.form.get('browser')
-        if url:
-            depth: int = request.form.get('depth') if request.form.get('depth') else 1  # type: ignore
-            listing: bool = request.form.get('listing') if request.form.get('listing') else False  # type: ignore
-            perma_uuid = lookyloo.capture(url=url, cookies_pseudofile=cookie_file,
-                                          depth=depth, listing=listing,
-                                          user_agent=user_agent,
-                                          referer=request.form.get('referer'),  # type: ignore
-                                          os=os, browser=browser)
-            return redirect(url_for('tree', tree_uuid=perma_uuid))
+            capture_query['user_agent'] = request.form.get('user_agent')
+            capture_query['os'] = request.form.get('os')
+            capture_query['browser'] = request.form.get('browser')
+
+        if request.form.get('depth'):
+            capture_query['depth'] = request.form.get('depth')
+        else:
+            capture_query['depth'] = 1
+
+        if request.form.get('listing'):
+            capture_query['listing'] = True
+        else:
+            capture_query['listing'] = False
+
+        if request.form.get('referer'):
+            capture_query['referer'] = request.form.get('referer')
+
+        perma_uuid = lookyloo.enqueue_capture(capture_query)
+        time.sleep(30)
+        return redirect(url_for('tree', tree_uuid=perma_uuid))
     user_agents: Dict[str, Any] = {}
    if use_own_ua:
         user_agents = get_user_agents('own_user_agents')
@@ -691,6 +697,10 @@ def capture_web():
         if 'bot' not in ua['useragent'].lower():
             default_ua = ua
             break
+    splash_up, message = lookyloo.splash_status()
+    if not splash_up:
+        flash(f'The capture module is not reachable ({message}).', 'error')
+        flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
     return render_template('capture.html', user_agents=user_agents, default=default_ua,
                            max_depth=max_depth, personal_ua=request.headers.get('User-Agent'))
 
diff --git a/website/web/templates/capture.html b/website/web/templates/capture.html
index 1820dc06..2f5e3d07 100644
--- a/website/web/templates/capture.html
+++ b/website/web/templates/capture.html
@@ -1,4 +1,5 @@
 {% extends "main.html" %}
+{% from 'bootstrap/utils.html' import render_messages %}
 
 {% block title %}Capture{% endblock %}
 {% block card %}
@@ -28,6 +29,8 @@
+  {{ render_messages(container=True, dismissible=True) }}
+
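
The heart of this change is the new splash_status() health check: it probes the _ping endpoint that Splash exposes, which answers with a JSON body whose "status" field is "ok" when the renderer is ready to accept jobs. The check gates both sides of the new queue: process_capture_queue() refuses to pop a UUID from the to_capture set while Splash is down, and the /capture view flashes a warning (rendered by the render_messages block added to capture.html) instead of failing silently. Below is a minimal standalone sketch of the same pattern, assuming a local Splash instance; the SPLASH_URL value and the explicit timeout are illustrative additions, not part of the diff:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Sketch of the health check splash_status() performs. SPLASH_URL is an
# example value (assumption) - point it at your own Splash instance.

from typing import Tuple
from urllib.parse import urljoin

import requests
from requests.exceptions import HTTPError

SPLASH_URL = 'http://127.0.0.1:8050'


def splash_status(splash_url: str = SPLASH_URL) -> Tuple[bool, str]:
    '''Ping Splash and report whether it can accept capture jobs.'''
    try:
        # urljoin() keeps the scheme/host and appends the _ping path.
        response = requests.get(urljoin(splash_url, '_ping'), timeout=5)
        response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
        json_status = response.json()
        if json_status['status'] == 'ok':
            return True, 'Splash is up'
        return False, str(json_status)
    except HTTPError as http_err:
        return False, f'HTTP error occurred: {http_err}'
    except Exception as err:
        # Covers connection errors, timeouts, invalid JSON, ...
        return False, f'Other error occurred: {err}'


if __name__ == '__main__':
    up, message = splash_status()
    print(f'splash up: {up} ({message})')

Returning a (bool, str) tuple instead of raising lets both callers decide how loud to be: the queue worker logs a critical message and backs off, while the web view merely warns the user that the capture will sit in the queue until Splash is reachable again.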