new: Use async capture for the UI.

Add a method to make sure splash is up before trying to capture.
pull/197/head
Raphaël Vinot 2021-04-08 19:15:53 +02:00
parent d78ee5de11
commit 7707d638cf
6 changed files with 67 additions and 27 deletions

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
from subprocess import run, Popen
from lookyloo.helpers import get_homedir
from lookyloo.helpers import get_homedir, get_config
def main():
@ -13,7 +13,8 @@ def main():
p.check_returncode()
print('done.')
print('Start asynchronous ingestor...')
Popen(['async_capture'])
for i in range(get_config('generic', 'async_capture_processes')):
Popen(['async_capture'])
print('done.')
print('Start background indexer...')
Popen(['background_indexer'])

View File

@ -15,6 +15,7 @@
"hours": 0
},
"max_depth": 1,
"async_capture_processes": 1,
"use_user_agents_users": false,
"enable_default_blur_screenshot": false,
"enable_context_by_users": false,
@ -42,6 +43,7 @@
"users": "A mapping of admin accounts. Format: {username: password}",
"time_delta_on_index": "Time interval of the capture displayed on the index",
"max_depth": "Maximum depth for scraping. Anything > 1 will be exponentially bigger.",
"async_capture_processes": "Number of async_capture processes to start. This should not be higher than the number of splash instances you have running. A very high number will use *a lot* of ram.",
"use_user_agents_users": "Only usable for medium/high use instances: use the user agents of the users of the platform",
"enable_default_blur_screenshot": "If true, blur the screenshot by default (useful on public instances)",
"enable_context_by_users": "Allow the users to add context to a response body",

View File

@ -16,7 +16,7 @@ import smtplib
import socket
import sys
from typing import Union, Dict, List, Tuple, Optional, Any, MutableMapping, Set, Iterable
from urllib.parse import urlsplit
from urllib.parse import urlsplit, urljoin
from uuid import uuid4
from zipfile import ZipFile
import operator
@ -28,6 +28,8 @@ from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
from PIL import Image # type: ignore
from pymisp import MISPEvent, MISPAttribute, MISPObject
from pymisp.tools import URLObject, FileObject
import requests
from requests.exceptions import HTTPError
from redis import Redis
from scrapysplashwrapper import crawl
from werkzeug.useragents import UserAgent
@ -584,6 +586,14 @@ class Lookyloo():
def process_capture_queue(self) -> Union[bool, None]:
'''Process a query from the capture queue'''
if not self.redis.exists('to_capture'):
return None
status, message = self.splash_status()
if not status:
self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
return None
uuid = self.redis.spop('to_capture')
if not uuid:
return None
@ -735,6 +745,20 @@ class Lookyloo():
return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
- set(ct.root_hartree.all_url_requests.keys()))
def splash_status(self) -> Tuple[bool, str]:
    """Check whether the Splash rendering service is reachable and healthy.

    Queries Splash's ``_ping`` endpoint and returns a ``(status, message)``
    tuple: ``(True, 'Splash is up')`` when the endpoint answers with
    ``{"status": "ok"}``, otherwise ``(False, <reason>)``.
    Never raises: any HTTP, network, or JSON failure is folded into the
    returned message so callers can just test the boolean.
    """
    try:
        # Bounded timeout: requests.get has NO default timeout, so a hung
        # Splash instance would otherwise block the capture queue forever.
        splash_status = requests.get(urljoin(self.splash_url, '_ping'), timeout=10)
        splash_status.raise_for_status()
        json_status = splash_status.json()
        if json_status['status'] == 'ok':
            return True, 'Splash is up'
        else:
            return False, str(json_status)
    except HTTPError as http_err:
        return False, f'HTTP error occurred: {http_err}'
    except Exception as err:
        # Covers connection errors, timeouts, and malformed JSON responses.
        return False, f'Other error occurred: {err}'
def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
@ -768,6 +792,8 @@ class Lookyloo():
if int(depth) > int(get_config('generic', 'max_depth')):
self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
depth = int(get_config('generic', 'max_depth'))
if not perma_uuid:
perma_uuid = str(uuid4())
self.logger.info(f'Capturing {url}')
try:
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
@ -779,8 +805,6 @@ class Lookyloo():
# broken
self.logger.critical(f'Something went terribly wrong when capturing {url}.')
return False
if not perma_uuid:
perma_uuid = str(uuid4())
width = len(str(len(items)))
dirpath = self.capture_dir / datetime.now().isoformat()
safe_create_dir(dirpath)

6
poetry.lock generated
View File

@ -188,7 +188,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "decorator"
version = "5.0.5"
version = "5.0.6"
description = "Decorators for Humans"
category = "dev"
optional = false
@ -1314,8 +1314,8 @@ cssselect = [
{file = "cssselect-1.1.0.tar.gz", hash = "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"},
]
decorator = [
{file = "decorator-5.0.5-py3-none-any.whl", hash = "sha256:b7157d62ea3c2c0c57b81a05e4569853e976a3dda5dd7a1cb86be78978c3c5f8"},
{file = "decorator-5.0.5.tar.gz", hash = "sha256:acda948ffcfe4bd0c4a57834b74ad968b91925b8201b740ca9d46fb8c5c618ce"},
{file = "decorator-5.0.6-py3-none-any.whl", hash = "sha256:d9f2d2863183a3c0df05f4b786f2e6b8752c093b3547a558f287bf3022fd2bf4"},
{file = "decorator-5.0.6.tar.gz", hash = "sha256:f2e71efb39412bfd23d878e896a51b07744f2e2250b2e87d158e76828c5ae202"},
]
defang = [
{file = "defang-0.5.3.tar.gz", hash = "sha256:86aeff658d7cd4c3b61d16089872e1c1f0a1b7b3c64d4ca9525c017caeb284d7"},

View File

@ -13,6 +13,7 @@ from typing import Optional, Dict, Any, Union
import logging
import hashlib
from urllib.parse import quote_plus, unquote_plus
import time
from flask import Flask, render_template, request, send_file, redirect, url_for, Response, flash, jsonify
from flask_bootstrap import Bootstrap # type: ignore
@ -658,29 +659,34 @@ def search():
@app.route('/capture', methods=['GET', 'POST'])
def capture_web():
if request.form.get('url'):
capture_query = {'url': request.form.get('url')}
# check if the post request has the file part
if 'cookies' in request.files and request.files['cookies'].filename:
cookie_file = request.files['cookies'].stream
else:
cookie_file = None
url = request.form.get('url')
capture_query['cookies'] = request.files['cookies'].stream.read()
if request.form.get('personal_ua') and request.headers.get('User-Agent'):
user_agent = request.headers.get('User-Agent')
os = None
browser = None
capture_query['user_agent'] = request.headers.get('User-Agent')
else:
user_agent = request.form.get('user_agent')
os = request.form.get('os')
browser = request.form.get('browser')
if url:
depth: int = request.form.get('depth') if request.form.get('depth') else 1 # type: ignore
listing: bool = request.form.get('listing') if request.form.get('listing') else False # type: ignore
perma_uuid = lookyloo.capture(url=url, cookies_pseudofile=cookie_file,
depth=depth, listing=listing,
user_agent=user_agent,
referer=request.form.get('referer'), # type: ignore
os=os, browser=browser)
return redirect(url_for('tree', tree_uuid=perma_uuid))
capture_query['user_agent'] = request.form.get('user_agent')
capture_query['os'] = request.form.get('os')
capture_query['browser'] = request.form.get('browser')
if request.form.get('depth'):
capture_query['depth'] = request.form.get('depth')
else:
capture_query['depth'] = 1
if request.form.get('listing'):
capture_query['listing'] = True
else:
capture_query['listing'] = False
if request.form.get('referer'):
capture_query['referer'] = request.form.get('referer')
perma_uuid = lookyloo.enqueue_capture(capture_query)
time.sleep(30)
return redirect(url_for('tree', tree_uuid=perma_uuid))
user_agents: Dict[str, Any] = {}
if use_own_ua:
user_agents = get_user_agents('own_user_agents')
@ -691,6 +697,10 @@ def capture_web():
if 'bot' not in ua['useragent'].lower():
default_ua = ua
break
splash_up, message = lookyloo.splash_status()
if not splash_up:
flash(f'The capture module is not reachable ({message}).', 'error')
flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
return render_template('capture.html', user_agents=user_agents, default=default_ua,
max_depth=max_depth, personal_ua=request.headers.get('User-Agent'))

View File

@ -1,4 +1,5 @@
{% extends "main.html" %}
{% from 'bootstrap/utils.html' import render_messages %}
{% block title %}Capture{% endblock %}
{% block card %}
@ -28,6 +29,8 @@
</a>
</center>
</br>
{{ render_messages(container=True, dismissible=True) }}
</br>
<form role="form" action="{{ url_for('capture_web') }}" method=post enctype=multipart/form-data>
<div class="form-group row">
<div class="col-sm-10">