new: Upload a file instead of submitting a URL.

Raphaël Vinot 2022-08-04 16:58:07 +02:00
parent 2ce8b5a96c
commit 72c4e43474
7 changed files with 129 additions and 2776 deletions

View File

@ -4,10 +4,13 @@ import asyncio
import ipaddress
import json
import logging
import os
import socket
from datetime import datetime
from io import BufferedIOBase
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import urlsplit
@ -37,18 +40,18 @@ class AsyncCapture(AbstractManager):
if not
self.logger.warning('Unable to setup the FOX module')
def thirdparty_submit(self, capture_data: Dict[str, str]) -> None:
def thirdparty_submit(self, url: str) -> None:
if['url'], auto_trigger=True), auto_trigger=True)
async def process_capture_queue(self) -> None:
'''Process a query from the capture queue'''
value: List[Tuple[str, float]] = await self.redis.zpopmax('to_capture')
value: List[Tuple[bytes, float]] = await self.redis.zpopmax('to_capture')
if not value or not value[0]:
# The queue was consumed by another process.
uuid, _score = value[0]
queue: Optional[str] = await self.redis.get(f'{uuid}_mgmt')
uuid = value[0][0].decode()
queue: Optional[bytes] = await self.redis.get(f'{uuid}_mgmt')
await self.redis.sadd('ongoing', uuid)
async with self.redis.pipeline() as lazy_cleanup:
@ -57,55 +60,70 @@ class AsyncCapture(AbstractManager):
# queue shouldn't be None, but if it is, just ignore it.
await lazy_cleanup.zincrby('queues', -1, queue)
to_capture: Dict[str, str] = await self.redis.hgetall(uuid)
to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid)
if get_config('generic', 'default_public'):
# By default, the captures are on the index, unless the user marks them as un-listed
listing = False if ('listing' in to_capture and to_capture['listing'].lower() in ['false', '0', '']) else True
listing = False if ('listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
# By default, the captures are not on the index, unless the user marks them as listed
listing = True if ('listing' in to_capture and to_capture['listing'].lower() in ['true', '1']) else False
listing = True if ('listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False
# Turn the freetext for the headers into a dict
headers = {}
if 'headers' in to_capture:
for header_line in to_capture['headers'].splitlines():
headers: Dict[str, str] = {}
if b'headers' in to_capture:
for header_line in to_capture[b'headers'].decode().splitlines():
if header_line and ':' in header_line:
splitted = header_line.split(':', 1)
if splitted and len(splitted) == 2:
header, h_value = splitted
if header and h_value:
headers[header.strip()] = h_value.strip()
if to_capture.get('dnt'):
headers['DNT'] = to_capture['dnt']
if to_capture.get(b'dnt'):
headers['DNT'] = to_capture[b'dnt'].decode()'Capturing {to_capture["url"]} - {uuid}')
success, error_message = await self._capture(
cookies_pseudofile=to_capture.get('cookies', None),
user_agent=to_capture.get('user_agent', None),
referer=to_capture.get('referer', None),
headers=headers if headers else None,
proxy=to_capture.get('proxy', None),
os=to_capture.get('os', None),
browser=to_capture.get('browser', None),
parent=to_capture.get('parent', None)
if success:'Successfully captured {to_capture["url"]} - {uuid}')
if to_capture.get(b'document'):
# we do not have a URL yet.
document_name = Path(to_capture[b'document_name'].decode()).name
tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
with open(, "wb") as f:
url = f'file://{}'
self.logger.warning(f'Unable to capture {to_capture["url"]} - {uuid}: {error_message}')
await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {to_capture["url"]} - {uuid}')
url = to_capture[b'url'].decode()
self.thirdparty_submit(url)'Capturing {url} - {uuid}')
success, error_message = await self._capture(
cookies_pseudofile=to_capture.get(b'cookies', None),
user_agent=to_capture[b'user_agent'].decode() if to_capture.get(b'user_agent') else None,
referer=to_capture[b'referer'].decode() if to_capture.get(b'referer') else None,
headers=headers if headers else None,
proxy=to_capture[b'proxy'].decode() if to_capture.get(b'proxy') else None,
os=to_capture[b'os'].decode() if to_capture.get(b'os') else None,
browser=to_capture[b'browser'].decode() if to_capture.get(b'browser') else None,
parent=to_capture[b'parent'].decode() if to_capture.get(b'parent') else None
if to_capture.get(b'document'):
if success:'Successfully captured {url} - {uuid}')
self.logger.warning(f'Unable to capture {url} - {uuid}: {error_message}')
await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')
await lazy_cleanup.srem('ongoing', uuid)
await lazy_cleanup.delete(uuid)
# make sure to expire the key if nothing was processed for a while (= queues empty)
await lazy_cleanup.expire('queues', 600)
await lazy_cleanup.execute()
async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
async def _capture(self, url: str, *, perma_uuid: str,
cookies_pseudofile: Optional[Union[BufferedIOBase, str, bytes]]=None,
listing: bool=True, user_agent: Optional[str]=None,
referer: Optional[str]=None,
headers: Optional[Dict[str, str]]=None,
@ -114,7 +132,7 @@ class AsyncCapture(AbstractManager):
'''Launch a capture'''
url = url.strip()
url = refang(url)
if not url.startswith('http'):
if not url.startswith('data') and not url.startswith('http') and not url.startswith('file'):
url = f'http://{url}'
splitted_url = urlsplit(url)
if self.only_global_lookups:
@ -187,11 +205,11 @@ class AsyncCapture(AbstractManager):
if 'downloaded_filename' in entries and entries['downloaded_filename']:
with(dirpath / '').open('w') as _downloaded_filename:
with (dirpath / '').open('w') as _downloaded_filename:
if 'downloaded_file' in entries and entries['downloaded_file']:
with(dirpath / '').open('wb') as _downloaded_file:
with (dirpath / '').open('wb') as _downloaded_file:
if 'error' in entries:
@ -223,7 +241,7 @@ class AsyncCapture(AbstractManager):
return True, 'All good!'
async def _to_run_forever_async(self):
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'))
while await self.redis.exists('to_capture'):
await self.process_capture_queue()
if self.shutdown_requested():

View File

@ -129,14 +129,14 @@ def load_known_content(directory: str='known_content') -> Dict[str, Dict[str, An
return to_return
def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str]]=None) -> List[Dict[str, Union[str, bool]]]:
def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str, bytes]]=None) -> List[Dict[str, Union[str, bool]]]:
cookies: List[Dict[str, Union[str, bool]]]
if cookie_pseudofile:
if isinstance(cookie_pseudofile, str):
if isinstance(cookie_pseudofile, (str, bytes)):
cookies = json.loads(cookie_pseudofile)
except json.decoder.JSONDecodeError:
logger.warning(f'Unable to load json content: {cookie_pseudofile}')
logger.warning(f'Unable to load json content: {cookie_pseudofile!r}')
return []
# Note: we might have an empty BytesIO, which is not False.

View File

@ -5,6 +5,7 @@ import hashlib
import json
import logging
import operator
import pickle
import smtplib
from collections import defaultdict
@ -398,11 +399,9 @@ class Lookyloo():
query[key] = 1 if value else 0
elif isinstance(value, (list, dict)):
query[key] = json.dumps(value)
elif isinstance(value, bytes):
query[key] = value.decode()
# dirty deduplicate
hash_query = hashlib.sha512(json.dumps(query).encode()).hexdigest()
hash_query = hashlib.sha512(pickle.dumps(query)).hexdigest()
# FIXME The line below should work, but it doesn't
# if (existing_uuid := self.redis.set(f'query_hash:{hash_query}', temp_uuid, get=True, nx=True, ex=300)):
if (existing_uuid := self.redis.get(f'query_hash:{hash_query}')):

poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,9 @@ import json
import logging
import os
import time
import filetype
import filetype # type: ignore
from datetime import date, datetime, timedelta, timezone
from io import BytesIO, StringIO
from typing import Any, Dict, List, Optional, Union, TypedDict
@ -489,16 +491,20 @@ def image(tree_uuid: str):
return send_file(to_return, mimetype='image/png',
as_attachment=True, attachment_filename='image.png')
@app.route('/tree/<string:tree_uuid>/data', methods=['GET'])
def data(tree_uuid: str):
filename, data = lookyloo.get_data(tree_uuid)
if len(filename) != 0:
if filetype.guess_mime(data.getvalue()) is None:
mime = 'application/octet-stream'
mime = filetype.guess_mime(data.getvalue())
return send_file(data, mimetype= mime,
as_attachment=True, attachment_filename=filename)
if len(filename) == 0:
# TODO: return something saying it is not a valid request
if filetype.guess_mime(data.getvalue()) is None:
mime = 'application/octet-stream'
mime = filetype.guess_mime(data.getvalue())
return send_file(data, mimetype=mime,
as_attachment=True, attachment_filename=filename)
@app.route('/tree/<string:tree_uuid>/thumbnail/', defaults={'width': 64}, methods=['GET'])
@ -856,7 +862,11 @@ def capture_web():
user = src_request_ip(request)
if request.method == 'POST' and (request.form.get('url') or request.form.get('urls')):
if request.method == 'POST':
if not (request.form.get('url') or request.form.get('urls') or 'document' in request.files):
flash('Invalid submission: please submit at least a URL or a document.', 'error')
return _prepare_capture_template(user_ua=request.headers.get('User-Agent'))
capture_query: Dict[str, Union[str, bytes, int, bool]] = {}
# check if the post request has the file part
if 'cookies' in request.files and request.files['cookies'].filename:
@ -902,7 +912,7 @@ def capture_web():
perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
return redirect(url_for('tree', tree_uuid=perma_uuid))
elif request.form.get('urls'):
# bulk query
bulk_captures = []
for url in request.form['urls'].split('\n'):
@ -912,6 +922,13 @@ def capture_web():
bulk_captures.append((new_capture_uuid, url))
return render_template('bulk_captures.html', bulk_captures=bulk_captures)
elif 'document' in request.files:
# File upload
capture_query['document'] = request.files['document']
capture_query['document_name'] = request.files['document'].filename
perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
return redirect(url_for('tree', tree_uuid=perma_uuid))
elif request.method == 'GET' and request.args.get('url'):
url = unquote_plus(request.args['url']).strip()
capture_query = {'url': url}

View File

@ -326,7 +326,9 @@ class CaptureCookies(Resource):
# Just text
submit_fields_post = api.model('SubmitFieldsPost', {
'url': fields.Url(description="The URL to capture", required=True),
'url': fields.Url(description="The URL to capture"),
'document': fields.String(description="A base64 encoded document, it can be anything a browser can display."),
'document_name': fields.String(description="The name of the document."),
'listing': fields.Integer(description="Display the capture on the index", min=0, max=1, example=1),
'user_agent': fields.String(description="User agent to use for the capture", example=''),
'referer': fields.String(description="Referer to pass to the capture", example=''),
@ -376,6 +378,8 @@ class SubmitCapture(Resource):
user = src_request_ip(request)
to_query: Dict = request.get_json(force=True)
if 'document' in to_query:
to_query['document'] = base64.b64decode(to_query['document'])
perma_uuid = lookyloo.enqueue_capture(to_query, source='api', user=user, authenticated=flask_login.current_user.is_authenticated)
return perma_uuid

View File

@ -39,20 +39,45 @@
<div class="row input-group mb-3">
<label for="url" class="col-sm-1 col-form-label">URL:</label>
<input type="text" class="form-control col-auto" name="url" id=singleCaptureField
placeholder="URL to capture" value="{{predefined_url_to_capture}}" required>
<textarea class="form-control col-auto d-none" placeholder="URLs to capture, one per line"
name="urls" id=multipleCapturesField></textarea>
<span class="col-sm-2 input-group-text">
<div class="form-check">
<input class="form-check-input" name="multipleCaptures" id="multipleCaptures" type="checkbox"
value="" aria-label="tick to enable multiple captures">
<label for="multipleCaptures" class="form-check-label">Multiple captures</label>
<div class="nav nav-tabs" id="submission-type" role="tablist">
<button class="nav-link active" id="nav-url-tab" data-bs-toggle="tab" data-bs-target="#nav-url" type="button" role="tab" aria-current="nav-url" aria-selected="true" href="#">URL(s)</button>
<button class="nav-link" id="nav-doc-tab" data-bs-toggle="tab" data-bs-target="#nav-doc" type="button" role="tab" aria-current="nav-doc" aria-selected="false" href="#">Document</button>
<div class="tab-content" id="nav-tabContent">
<div class="tab-pane fade show active" id="nav-url" role="tabpanel" aria-labelledby="nav-url-tab">
<div class="row input-group mb-3">
<label for="url" class="col-sm-1 col-form-label">URL:</label>
<input type="text" class="form-control col-auto" name="url" id=singleCaptureField
placeholder="URL to capture" value="{{predefined_url_to_capture}}">
<textarea class="form-control col-auto d-none" placeholder="URLs to capture, one per line"
name="urls" id=multipleCapturesField></textarea>
<span class="col-sm-2 input-group-text">
<div class="form-check">
<input class="form-check-input" name="multipleCaptures" id="multipleCaptures" type="checkbox"
value="" aria-label="tick to enable multiple captures">
<label for="multipleCaptures" class="form-check-label">Multiple captures</label>
<div class="tab-pane fade" id="nav-doc" role="tabpanel" aria-labelledby="nav-doc-tab">
<div class="row mb-3">
<label for="document" class="col-sm-1 col-form-label">Document:</label>
<div class="col-sm-10">
<input type="file" class="form-control-file" id="document" name="document">
<div>Instead of a URL, you can upload a file. Preferably an HTML document, but it can be anything supported by a browser.</div>
<div class="dropdown-divider"></div>
<button class="btn btn-link" type="button" data-bs-toggle="collapse" data-bs-target="#collapseConfigBrowser"
@ -228,16 +253,12 @@
if (document.getElementById('multipleCaptures').checked == true) {
document.getElementById('singleCaptureField').value = '';
$("#multipleCapturesField").attr("required", true);
else {
document.getElementById('multipleCapturesField').value = '';
$("#singleCaptureField").attr("required", true);