diff --git a/bin/async_capture.py b/bin/async_capture.py
index eb48b67..4457ca6 100755
--- a/bin/async_capture.py
+++ b/bin/async_capture.py
@@ -4,10 +4,13 @@ import asyncio
 import ipaddress
 import json
 import logging
+import os
 import socket
+
 from datetime import datetime
 from io import BufferedIOBase
 from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import Dict, List, Optional, Tuple, Union
 from urllib.parse import urlsplit
 
@@ -37,18 +40,18 @@ class AsyncCapture(AbstractManager):
         if not self.fox.available:
             self.logger.warning('Unable to setup the FOX module')
 
-    def thirdparty_submit(self, capture_data: Dict[str, str]) -> None:
+    def thirdparty_submit(self, url: str) -> None:
         if self.fox.available:
-            self.fox.capture_default_trigger(capture_data['url'], auto_trigger=True)
+            self.fox.capture_default_trigger(url, auto_trigger=True)
 
     async def process_capture_queue(self) -> None:
         '''Process a query from the capture queue'''
-        value: List[Tuple[str, float]] = await self.redis.zpopmax('to_capture')
+        value: List[Tuple[bytes, float]] = await self.redis.zpopmax('to_capture')
         if not value or not value[0]:
             # The queue was consumed by an other process.
             return
-        uuid, _score = value[0]
-        queue: Optional[str] = await self.redis.get(f'{uuid}_mgmt')
+        uuid = value[0][0].decode()
+        queue: Optional[bytes] = await self.redis.get(f'{uuid}_mgmt')
         await self.redis.sadd('ongoing', uuid)
 
         async with self.redis.pipeline() as lazy_cleanup:
@@ -57,55 +60,70 @@ class AsyncCapture(AbstractManager):
             # queue shouldn't be none, but if it is, just ignore.
             await lazy_cleanup.zincrby('queues', -1, queue)
 
-            to_capture: Dict[str, str] = await self.redis.hgetall(uuid)
+            to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid)
 
             if get_config('generic', 'default_public'):
                 # By default, the captures are on the index, unless the user mark them as un-listed
-                listing = False if ('listing' in to_capture and to_capture['listing'].lower() in ['false', '0', '']) else True
+                listing = False if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
             else:
                 # By default, the captures are not on the index, unless the user mark them as listed
-                listing = True if ('listing' in to_capture and to_capture['listing'].lower() in ['true', '1']) else False
+                listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False
 
             # Turn the freetext for the headers into a dict
-            headers = {}
-            if 'headers' in to_capture:
-                for header_line in to_capture['headers'].splitlines():
+            headers: Dict[str, str] = {}
+            if b'headers' in to_capture:
+                for header_line in to_capture[b'headers'].decode().splitlines():
                     if header_line and ':' in header_line:
                         splitted = header_line.split(':', 1)
                         if splitted and len(splitted) == 2:
                             header, h_value = splitted
                             if header and h_value:
                                 headers[header.strip()] = h_value.strip()
-            if to_capture.get('dnt'):
-                headers['DNT'] = to_capture['dnt']
+            if to_capture.get(b'dnt'):
+                headers['DNT'] = to_capture[b'dnt'].decode()
 
-            self.logger.info(f'Capturing {to_capture["url"]} - {uuid}')
-            self.thirdparty_submit(to_capture)
-            success, error_message = await self._capture(
-                to_capture['url'],
-                perma_uuid=uuid,
-                cookies_pseudofile=to_capture.get('cookies', None),
-                listing=listing,
-                user_agent=to_capture.get('user_agent', None),
-                referer=to_capture.get('referer', None),
-                headers=headers if headers else None,
-                proxy=to_capture.get('proxy', None),
-                os=to_capture.get('os', None),
-                browser=to_capture.get('browser', None),
-                parent=to_capture.get('parent', None)
-            )
-            if success:
-                self.logger.info(f'Successfully captured {to_capture["url"]} - {uuid}')
+            if to_capture.get(b'document'):
+                # we do not have a URL yet.
+                document_name = Path(to_capture[b'document_name'].decode()).name
+                tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
+                with open(tmp_f.name, "wb") as f:
+                    f.write(to_capture[b'document'])
+                url = f'file://{tmp_f.name}'
             else:
-                self.logger.warning(f'Unable to capture {to_capture["url"]} - {uuid}: {error_message}')
-                await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {to_capture["url"]} - {uuid}')
+                url = to_capture[b'url'].decode()
+            self.thirdparty_submit(url)
+
+            self.logger.info(f'Capturing {url} - {uuid}')
+            success, error_message = await self._capture(
+                url,
+                perma_uuid=uuid,
+                cookies_pseudofile=to_capture.get(b'cookies', None),
+                listing=listing,
+                user_agent=to_capture[b'user_agent'].decode() if to_capture.get(b'user_agent') else None,
+                referer=to_capture[b'referer'].decode() if to_capture.get(b'referer') else None,
+                headers=headers if headers else None,
+                proxy=to_capture[b'proxy'].decode() if to_capture.get(b'proxy') else None,
+                os=to_capture[b'os'].decode() if to_capture.get(b'os') else None,
+                browser=to_capture[b'browser'].decode() if to_capture.get(b'browser') else None,
+                parent=to_capture[b'parent'].decode() if to_capture.get(b'parent') else None
+            )
+
+            if to_capture.get(b'document'):
+                os.unlink(tmp_f.name)
+
+            if success:
+                self.logger.info(f'Successfully captured {url} - {uuid}')
+            else:
+                self.logger.warning(f'Unable to capture {url} - {uuid}: {error_message}')
+                await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')
             await lazy_cleanup.srem('ongoing', uuid)
             await lazy_cleanup.delete(uuid)
             # make sure to expire the key if nothing was processed for a while (= queues empty)
             await lazy_cleanup.expire('queues', 600)
             await lazy_cleanup.execute()
 
-    async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+    async def _capture(self, url: str, *, perma_uuid: str,
+                       cookies_pseudofile: Optional[Union[BufferedIOBase, str, bytes]]=None,
                        listing: bool=True, user_agent: Optional[str]=None,
                        referer: Optional[str]=None,
                        headers: Optional[Dict[str, str]]=None,
@@ -114,7 +132,7 @@ class AsyncCapture(AbstractManager):
         '''Launch a capture'''
         url = url.strip()
         url = refang(url)
-        if not url.startswith('http'):
+        if not url.startswith('data') and not url.startswith('http') and not url.startswith('file'):
             url = f'http://{url}'
         splitted_url = urlsplit(url)
         if self.only_global_lookups:
@@ -187,11 +205,11 @@ class AsyncCapture(AbstractManager):
             _parent.write(parent)
 
         if 'downloaded_filename' in entries and entries['downloaded_filename']:
-            with(dirpath / '0.data.filename').open('w') as _downloaded_filename:
+            with (dirpath / '0.data.filename').open('w') as _downloaded_filename:
                 _downloaded_filename.write(entries['downloaded_filename'])
 
         if 'downloaded_file' in entries and entries['downloaded_file']:
-            with(dirpath / '0.data').open('wb') as _downloaded_file:
+            with (dirpath / '0.data').open('wb') as _downloaded_file:
                 _downloaded_file.write(entries['downloaded_file'])
 
         if 'error' in entries:
@@ -223,7 +241,7 @@ class AsyncCapture(AbstractManager):
         return True, 'All good!'
 
     async def _to_run_forever_async(self):
-        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
+        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'))
         while await self.redis.exists('to_capture'):
             await self.process_capture_queue()
             if self.shutdown_requested():
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index 8699a41..49294c5 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -129,14 +129,14 @@ def load_known_content(directory: str='known_content') -> Dict[str, Dict[str, An
     return to_return
 
 
-def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str]]=None) -> List[Dict[str, Union[str, bool]]]:
+def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str, bytes]]=None) -> List[Dict[str, Union[str, bool]]]:
     cookies: List[Dict[str, Union[str, bool]]]
     if cookie_pseudofile:
-        if isinstance(cookie_pseudofile, str):
+        if isinstance(cookie_pseudofile, (str, bytes)):
             try:
                 cookies = json.loads(cookie_pseudofile)
             except json.decoder.JSONDecodeError:
-                logger.warning(f'Unable to load json content: {cookie_pseudofile}')
+                logger.warning(f'Unable to load json content: {cookie_pseudofile!r}')
                 return []
         else:
             # Note: we might have an empty BytesIO, which is not False.
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 2974f82..11346aa 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -5,6 +5,7 @@ import hashlib
 import json
 import logging
 import operator
+import pickle
 import smtplib
 
 from collections import defaultdict
@@ -398,11 +399,9 @@ class Lookyloo():
                 query[key] = 1 if value else 0
             elif isinstance(value, (list, dict)):
                 query[key] = json.dumps(value)
-            elif isinstance(value, bytes):
-                query[key] = value.decode()
 
         # dirty deduplicate
-        hash_query = hashlib.sha512(json.dumps(query).encode()).hexdigest()
+        hash_query = hashlib.sha512(pickle.dumps(query)).hexdigest()
         # FIXME The line below should work, but it doesn't
         # if (existing_uuid := self.redis.set(f'query_hash:{hash_query}', temp_uuid, get=True, nx=True, ex=300)):
         if (existing_uuid := self.redis.get(f'query_hash:{hash_query}')):
diff --git a/poetry.lock b/poetry.lock
index 53db7b2..050f345 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -403,7 +403,7 @@ tornado = ["tornado (>=0.2)"]
 
 [[package]]
 name = "har2tree"
-version = "1.13.3"
+version = "1.13.4"
 description = "HTTP Archive (HAR) to ETE Toolkit generator"
 category = "main"
 optional = false
@@ -1220,7 +1220,7 @@ types-urllib3 = "<1.27"
 
 [[package]]
 name = "types-setuptools"
-version = "63.2.2"
+version = "63.2.3"
 description = "Typing stubs for setuptools"
 category = "dev"
 optional = false
@@ -1228,7 +1228,7 @@ python-versions = "*"
 
 [[package]]
 name = "types-urllib3"
-version = "1.26.21"
+version = "1.26.22"
 description = "Typing stubs for urllib3"
 category = "dev"
 optional = false
@@ -1400,7 +1400,7 @@ misp = ["python-magic", "pydeep2"]
 
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.8,<3.11"
-content-hash = "e610c176898413ec199ccfaa7f8a0afbfd6b1603dc869dad2d9662d83ba52c52"
+content-hash = "c5dbbe94d98ae4aaf7dd387149e2bad53ac098e8b6e951b2b87a4aacad4dc3be"
 
 [metadata.files]
 aiohttp = [
@@ -1762,8 +1762,8 @@ gunicorn = [
     {file = "gunicorn-20.1.0.tar.gz", hash = "sha256:e0a968b5ba15f8a328fdfd7ab1fcb5af4470c28aaf7e55df02a99bc13138e6e8"},
 ]
 har2tree = [
-    {file = "har2tree-1.13.3-py3-none-any.whl", hash = "sha256:1cb921d3dfd9048244edcfc1aa01e5f0884248bf0cc39aafa18165eaed393db9"},
-    {file = "har2tree-1.13.3.tar.gz", hash = "sha256:7481696e9c4f2907d4df391027e4c4ee01b03f4a588a726dc1904035c0d959af"},
+    {file = "har2tree-1.13.4-py3-none-any.whl", hash = "sha256:08b8758718215a7094f7784f3c98e336bbc567c4c3cd3ef8a96c9e136f9d1821"},
+    {file = "har2tree-1.13.4.tar.gz", hash = "sha256:767cd9c6c28b6486ce1dad956f6041e37f939ca9842729be83bcd2df1df910c1"},
 ]
 hiredis = [
     {file = "hiredis-2.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b4c8b0bc5841e578d5fb32a16e0c305359b987b850a06964bd5a62739d688048"},
@@ -2446,12 +2446,12 @@ types-requests = [
     {file = "types_requests-2.28.7-py3-none-any.whl", hash = "sha256:38015d310d13cf7d4d712d2507178349e13fd5dab85259dab7d9a9884c2c9c2a"},
 ]
 types-setuptools = [
-    {file = "types-setuptools-63.2.2.tar.gz", hash = "sha256:a9aa0c01d5f3443cd544026d5ffc97b95ddadf731dab13419c393d43fd8617c0"},
-    {file = "types_setuptools-63.2.2-py3-none-any.whl", hash = "sha256:a370df7a1e0dc856af9d998234f6e2ab04f30f25b8e1410f6db65910979f6252"},
+    {file = "types-setuptools-63.2.3.tar.gz", hash = "sha256:c0f5d452976e390527276fc4a6aa97cc1a38f4edb904f787441e70a472c37d44"},
+    {file = "types_setuptools-63.2.3-py3-none-any.whl", hash = "sha256:58f4b41db8b0dbd8222514872e48cbe4b3cb5a90e1ff4b6733e94fe83aaf0f79"},
 ]
 types-urllib3 = [
-    {file = "types-urllib3-1.26.21.tar.gz", hash = "sha256:2f7fb7ae6a1884241e588c7becd5f005ce1e03f6eface8a3f65e378c2adf9516"},
-    {file = "types_urllib3-1.26.21-py3-none-any.whl", hash = "sha256:2f960d8681002a37385263c372882fd12c676e10b127553738a2b6064e4438d1"},
+    {file = "types-urllib3-1.26.22.tar.gz", hash = "sha256:b05af90e73889e688094008a97ca95788db8bf3736e2776fd43fb6b171485d94"},
+    {file = "types_urllib3-1.26.22-py3-none-any.whl", hash = "sha256:09a8783e1002472e8d1e1f3792d4c5cca1fffebb9b48ee1512aae6d16fe186bc"},
 ]
 types-werkzeug = [
     {file = "types-Werkzeug-1.0.9.tar.gz", hash = "sha256:5cc269604c400133d452a40cee6397655f878fc460e03fde291b9e3a5eaa518c"},
diff --git a/pyproject.toml b/pyproject.toml
index 0360101..9bda2b0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,7 @@ pyhashlookup = "^1.2.0"
 lief = "^0.12.1"
 ua-parser = "^0.15.0"
 Flask-Login = "^0.6.2"
-har2tree = "^1.13.3"
+har2tree = "^1.13.4"
 playwrightcapture = "^1.13.4"
 passivetotal = "^2.5.9"
 werkzeug = "2.1.2"
@@ -81,7 +81,7 @@ types-pkg-resources = "^0.1.3"
 types-Deprecated = "^1.2.9"
 types-python-dateutil = "^2.8.19"
 types-beautifulsoup4 = "^4.11.4"
-types-setuptools = "^63.2.2"
+types-setuptools = "^63.2.3"
 types-Pillow = "^9.2.1"
 
 [build-system]
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 724b1e8..da23961 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -6,6 +6,7 @@ import json
 import logging
 import os
 import time
+
 from datetime import date, datetime, timedelta, timezone
 from io import BytesIO, StringIO
 from typing import Any, Dict, List, Optional, Union, TypedDict
@@ -844,7 +845,11 @@ def capture_web():
     else:
         user = src_request_ip(request)
 
-    if request.method == 'POST' and (request.form.get('url') or request.form.get('urls')):
+    if request.method == 'POST':
+        if not (request.form.get('url') or request.form.get('urls') or 'document' in request.files):
+            flash('Invalid submission: please submit at least a URL or a document.', 'error')
+            return _prepare_capture_template(user_ua=request.headers.get('User-Agent'))
+
         capture_query: Dict[str, Union[str, bytes, int, bool]] = {}
         # check if the post request has the file part
         if 'cookies' in request.files and request.files['cookies'].filename:
@@ -890,7 +895,7 @@ def capture_web():
             perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
             time.sleep(2)
             return redirect(url_for('tree', tree_uuid=perma_uuid))
-        else:
+        elif request.form.get('urls'):
             # bulk query
             bulk_captures = []
             for url in request.form['urls'].split('\n'):
@@ -900,6 +905,13 @@
                 bulk_captures.append((new_capture_uuid, url))
             return render_template('bulk_captures.html', bulk_captures=bulk_captures)
+        elif 'document' in request.files:
+            # File upload
+            capture_query['document'] = request.files['document'].stream.read()
+            capture_query['document_name'] = request.files['document'].filename
+            perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
+            time.sleep(2)
+            return redirect(url_for('tree', tree_uuid=perma_uuid))
 
     elif request.method == 'GET' and request.args.get('url'):
         url = unquote_plus(request.args['url']).strip()
         capture_query = {'url': url}
diff --git a/website/web/genericapi.py b/website/web/genericapi.py
index 040e323..d037505 100644
--- a/website/web/genericapi.py
+++ b/website/web/genericapi.py
@@ -326,7 +326,9 @@ class CaptureCookies(Resource):
 
 # Just text
 submit_fields_post = api.model('SubmitFieldsPost', {
-    'url': fields.Url(description="The URL to capture", required=True),
+    'url': fields.Url(description="The URL to capture"),
+    'document': fields.String(description="A base64 encoded document, it can be anything a browser can display."),
+    'document_name': fields.String(description="The name of the document."),
     'listing': fields.Integer(description="Display the capture on the index", min=0, max=1, example=1),
     'user_agent': fields.String(description="User agent to use for the capture", example=''),
     'referer': fields.String(description="Referer to pass to the capture", example=''),
@@ -376,6 +378,8 @@ class SubmitCapture(Resource):
         else:
             user = src_request_ip(request)
         to_query: Dict = request.get_json(force=True)
+        if 'document' in to_query:
+            to_query['document'] = base64.b64decode(to_query['document'])
 
         perma_uuid = lookyloo.enqueue_capture(to_query, source='api', user=user, authenticated=flask_login.current_user.is_authenticated)
         return perma_uuid
diff --git a/website/web/templates/capture.html b/website/web/templates/capture.html
index b99ad0d..9c235f0 100644
--- a/website/web/templates/capture.html
+++ b/website/web/templates/capture.html
@@ -39,20 +39,45 @@
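
Usage note: the new 'document' / 'document_name' fields in SubmitFieldsPost let a client submit a local file instead of a URL. The sketch below is illustrative only and not part of this change set; the instance URL is made up, and the '/submit' path plus the JSON response carrying the capture UUID are assumptions based on the existing SubmitCapture resource.

# Minimal client-side sketch (assumptions noted above).
import base64
from pathlib import Path

import requests

document = Path('report.pdf')  # hypothetical file to capture

payload = {
    'document': base64.b64encode(document.read_bytes()).decode(),  # the API base64-decodes this field
    'document_name': document.name,
    'listing': 0,  # keep the capture off the public index
}

response = requests.post('https://lookyloo.example/submit', json=payload)
print(response.json())  # UUID of the enqueued capture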
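
The json.dumps to pickle.dumps switch in lookyloo.py exists because the enqueued query can now carry raw bytes (the uploaded document), which json.dumps refuses to serialize. A minimal illustration of the deduplication hash, using a made-up query dict:

import hashlib
import pickle

# Hypothetical capture query carrying raw bytes for an uploaded document.
query = {'document': b'%PDF-1.4 ...', 'document_name': 'report.pdf', 'listing': 0}

# json.dumps(query) would raise TypeError (bytes are not JSON serializable),
# so the deduplication hash is computed over the pickled query instead.
hash_query = hashlib.sha512(pickle.dumps(query)).hexdigest()
print(hash_query)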