mirror of https://github.com/CIRCL/lookyloo

new: Use async capture for the UI.
Add a method to make sure splash is up before trying to capture.

pull/197/head
parent d78ee5de11
commit 7707d638cf
bin/start.py

@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 
 from subprocess import run, Popen
-from lookyloo.helpers import get_homedir
+from lookyloo.helpers import get_homedir, get_config
 
 
 def main():
@@ -13,7 +13,8 @@ def main():
     p.check_returncode()
     print('done.')
     print('Start asynchronous ingestor...')
-    Popen(['async_capture'])
+    for i in range(get_config('generic', 'async_capture_processes')):
+        Popen(['async_capture'])
     print('done.')
     print('Start background indexer...')
     Popen(['background_indexer'])
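Each async_capture process presumably wraps Lookyloo.process_capture_queue() (see the lookyloo.py hunk further down) in a loop. The actual bin/async_capture script is not part of this diff, so the following is only a minimal sketch of such a worker; its structure and the one-second back-off are assumptions.

#!/usr/bin/env python3
# Hypothetical sketch of an async_capture worker; NOT the real
# bin/async_capture script, which is not included in this diff.
from time import sleep

from lookyloo.lookyloo import Lookyloo


def main():
    lookyloo = Lookyloo()
    while True:
        # process_capture_queue() pops one UUID from the 'to_capture'
        # queue and runs the capture; it returns None when the queue is
        # empty or when Splash is unreachable.
        if lookyloo.process_capture_queue() is None:
            sleep(1)  # assumed back-off interval


if __name__ == '__main__':
    main()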
config/generic.json.sample

@@ -15,6 +15,7 @@
     "hours": 0
   },
   "max_depth": 1,
+  "async_capture_processes": 1,
   "use_user_agents_users": false,
   "enable_default_blur_screenshot": false,
   "enable_context_by_users": false,
@@ -42,6 +43,7 @@
     "users": "It is some kind of an admin accounts. Format: {username: password}",
     "time_delta_on_index": "Time interval of the capture displayed on the index",
     "max_depth": "Maximum depth for scraping. Anything > 1 will be exponentially bigger.",
+    "async_capture_processes": "Number of async_capture processes to start. This should not be higher than the number of splash instances you have running. A very high number will use *a lot* of ram.",
     "use_user_agents_users": "Only usable for medium/high use instances: use the user agents of the users of the platform",
     "enable_default_blur_screenshot": "If true, blur the screenshot by default (useful on public instances)",
     "enable_context_by_users": "Allow the users to add context to a response body",
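The new key is consumed in bin/start.py above via get_config('generic', 'async_capture_processes'). The real helper lives in lookyloo/helpers.py and is not shown in this diff; the sketch below is a hypothetical simplification of how it presumably resolves an entry.

# Hypothetical simplification of lookyloo.helpers.get_config; the real
# helper (not part of this diff) also handles defaults and the
# *.json.sample fallback.
import json
from pathlib import Path


def get_config(config_type: str, entry: str):
    # assumes configs live in config/<config_type>.json
    config_file = Path('config') / f'{config_type}.json'
    with config_file.open() as f:
        return json.load(f)[entry]


# Usage matching the bin/start.py hunk above:
# get_config('generic', 'async_capture_processes')  # -> 1 by default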
lookyloo/lookyloo.py

@@ -16,7 +16,7 @@ import smtplib
 import socket
 import sys
 from typing import Union, Dict, List, Tuple, Optional, Any, MutableMapping, Set, Iterable
-from urllib.parse import urlsplit
+from urllib.parse import urlsplit, urljoin
 from uuid import uuid4
 from zipfile import ZipFile
 import operator
@@ -28,6 +28,8 @@ from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
 from PIL import Image  # type: ignore
 from pymisp import MISPEvent, MISPAttribute, MISPObject
 from pymisp.tools import URLObject, FileObject
+import requests
+from requests.exceptions import HTTPError
 from redis import Redis
 from scrapysplashwrapper import crawl
 from werkzeug.useragents import UserAgent
@@ -584,6 +586,14 @@ class Lookyloo():
 
     def process_capture_queue(self) -> Union[bool, None]:
         '''Process a query from the capture queue'''
         if not self.redis.exists('to_capture'):
             return None
+
+        status, message = self.splash_status()
+        if not status:
+            self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
+            return None
+
         uuid = self.redis.spop('to_capture')
         if not uuid:
             return None
@@ -735,6 +745,20 @@ class Lookyloo():
         return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
                       - set(ct.root_hartree.all_url_requests.keys()))
 
+    def splash_status(self) -> Tuple[bool, str]:
+        try:
+            splash_status = requests.get(urljoin(self.splash_url, '_ping'))
+            splash_status.raise_for_status()
+            json_status = splash_status.json()
+            if json_status['status'] == 'ok':
+                return True, 'Splash is up'
+            else:
+                return False, str(json_status)
+        except HTTPError as http_err:
+            return False, f'HTTP error occurred: {http_err}'
+        except Exception as err:
+            return False, f'Other error occurred: {err}'
+
     def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
                 referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
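The new splash_status() method relies on Splash's _ping endpoint. For reference, a healthy Splash 3.x instance typically answers with a small JSON document containing a "status" field, which is what the 'ok' check above inspects; the exact extra fields may vary by Splash version. A standalone check against a local instance might look like this (the host and port are assumptions):

# Example probe of Splash's _ping endpoint; splash_status() above wraps
# the same call in try/except and returns a (bool, message) tuple.
import requests

resp = requests.get('http://127.0.0.1:8050/_ping')
resp.raise_for_status()
print(resp.json())  # e.g. {'status': 'ok', 'maxrss': 202276}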
@@ -768,6 +792,8 @@
         if int(depth) > int(get_config('generic', 'max_depth')):
             self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
             depth = int(get_config('generic', 'max_depth'))
+        if not perma_uuid:
+            perma_uuid = str(uuid4())
         self.logger.info(f'Capturing {url}')
         try:
             items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
@@ -779,8 +805,6 @@
             # broken
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False
-        if not perma_uuid:
-            perma_uuid = str(uuid4())
         width = len(str(len(items)))
         dirpath = self.capture_dir / datetime.now().isoformat()
         safe_create_dir(dirpath)
poetry.lock

@@ -188,7 +188,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
 [[package]]
 name = "decorator"
-version = "5.0.5"
+version = "5.0.6"
 description = "Decorators for Humans"
 category = "dev"
 optional = false
@@ -1314,8 +1314,8 @@ cssselect = [
     {file = "cssselect-1.1.0.tar.gz", hash = "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"},
 ]
 decorator = [
-    {file = "decorator-5.0.5-py3-none-any.whl", hash = "sha256:b7157d62ea3c2c0c57b81a05e4569853e976a3dda5dd7a1cb86be78978c3c5f8"},
-    {file = "decorator-5.0.5.tar.gz", hash = "sha256:acda948ffcfe4bd0c4a57834b74ad968b91925b8201b740ca9d46fb8c5c618ce"},
+    {file = "decorator-5.0.6-py3-none-any.whl", hash = "sha256:d9f2d2863183a3c0df05f4b786f2e6b8752c093b3547a558f287bf3022fd2bf4"},
+    {file = "decorator-5.0.6.tar.gz", hash = "sha256:f2e71efb39412bfd23d878e896a51b07744f2e2250b2e87d158e76828c5ae202"},
 ]
 defang = [
     {file = "defang-0.5.3.tar.gz", hash = "sha256:86aeff658d7cd4c3b61d16089872e1c1f0a1b7b3c64d4ca9525c017caeb284d7"},
website/web/__init__.py

@@ -13,6 +13,7 @@ from typing import Optional, Dict, Any, Union
 import logging
 import hashlib
 from urllib.parse import quote_plus, unquote_plus
+import time
 
 from flask import Flask, render_template, request, send_file, redirect, url_for, Response, flash, jsonify
 from flask_bootstrap import Bootstrap  # type: ignore
@@ -658,29 +659,34 @@ def search():
 @app.route('/capture', methods=['GET', 'POST'])
 def capture_web():
     if request.form.get('url'):
+        capture_query = {'url': request.form.get('url')}
         # check if the post request has the file part
         if 'cookies' in request.files and request.files['cookies'].filename:
-            cookie_file = request.files['cookies'].stream
-        else:
-            cookie_file = None
-        url = request.form.get('url')
+            capture_query['cookies'] = request.files['cookies'].stream.read()
+
         if request.form.get('personal_ua') and request.headers.get('User-Agent'):
-            user_agent = request.headers.get('User-Agent')
-            os = None
-            browser = None
+            capture_query['user_agent'] = request.headers.get('User-Agent')
         else:
-            user_agent = request.form.get('user_agent')
-            os = request.form.get('os')
-            browser = request.form.get('browser')
-        if url:
-            depth: int = request.form.get('depth') if request.form.get('depth') else 1  # type: ignore
-            listing: bool = request.form.get('listing') if request.form.get('listing') else False  # type: ignore
-            perma_uuid = lookyloo.capture(url=url, cookies_pseudofile=cookie_file,
-                                          depth=depth, listing=listing,
-                                          user_agent=user_agent,
-                                          referer=request.form.get('referer'),  # type: ignore
-                                          os=os, browser=browser)
-            return redirect(url_for('tree', tree_uuid=perma_uuid))
+            capture_query['user_agent'] = request.form.get('user_agent')
+            capture_query['os'] = request.form.get('os')
+            capture_query['browser'] = request.form.get('browser')
+
+        if request.form.get('depth'):
+            capture_query['depth'] = request.form.get('depth')
+        else:
+            capture_query['depth'] = 1
+
+        if request.form.get('listing'):
+            capture_query['listing'] = True
+        else:
+            capture_query['listing'] = False
+
+        if request.form.get('referer'):
+            capture_query['referer'] = request.form.get('referer')
+
+        perma_uuid = lookyloo.enqueue_capture(capture_query)
+        time.sleep(30)
+        return redirect(url_for('tree', tree_uuid=perma_uuid))
     user_agents: Dict[str, Any] = {}
     if use_own_ua:
         user_agents = get_user_agents('own_user_agents')
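enqueue_capture() itself is not part of this diff. Based on process_capture_queue() above (which pops from the 'to_capture' set via self.redis.spop), it presumably generates a UUID, stores the capture settings where a worker can find them, and publishes the UUID to that set. A minimal sketch under those assumptions; everything except the 'to_capture' key name is a guess:

# Hypothetical sketch of Lookyloo.enqueue_capture; the real method is
# not shown in this diff.
from uuid import uuid4

from redis import Redis


class LookylooSketch:
    def __init__(self):
        self.redis = Redis(decode_responses=True)

    def enqueue_capture(self, query: dict) -> str:
        perma_uuid = str(uuid4())
        # store the settings so an async_capture worker can retrieve
        # them (assumed per-UUID hash; values stringified for redis)
        self.redis.hmset(perma_uuid, {k: str(v) for k, v in query.items()})
        # hand the UUID to the queue that process_capture_queue() pops
        self.redis.sadd('to_capture', perma_uuid)
        return perma_uuid

The time.sleep(30) following the call in the hunk above gives one of the async_capture workers a head start on the capture before the user is redirected to the tree view.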
@@ -691,6 +697,10 @@ def capture_web():
         if 'bot' not in ua['useragent'].lower():
             default_ua = ua
             break
+    splash_up, message = lookyloo.splash_status()
+    if not splash_up:
+        flash(f'The capture module is not reachable ({message}).', 'error')
+        flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
     return render_template('capture.html', user_agents=user_agents, default=default_ua,
                            max_depth=max_depth, personal_ua=request.headers.get('User-Agent'))
 
website/web/templates/capture.html

@@ -1,4 +1,5 @@
 {% extends "main.html" %}
+{% from 'bootstrap/utils.html' import render_messages %}
 {% block title %}Capture{% endblock %}
 
 {% block card %}
@@ -28,6 +29,8 @@
   </a>
   </center>
   </br>
+  {{ render_messages(container=True, dismissible=True) }}
+  </br>
   <form role="form" action="{{ url_for('capture_web') }}" method=post enctype=multipart/form-data>
    <div class="form-group row">
     <div class="col-sm-10">