mirror of https://github.com/CIRCL/lookyloo
new: Upload a file instead of submitting a URL.
parent 2ce8b5a96c
commit 72c4e43474

@@ -4,10 +4,13 @@ import asyncio
 import ipaddress
 import json
 import logging
+import os
 import socket

 from datetime import datetime
 from io import BufferedIOBase
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import Dict, List, Optional, Tuple, Union
 from urllib.parse import urlsplit

@@ -37,18 +40,18 @@ class AsyncCapture(AbstractManager):
         if not self.fox.available:
             self.logger.warning('Unable to setup the FOX module')

-    def thirdparty_submit(self, capture_data: Dict[str, str]) -> None:
+    def thirdparty_submit(self, url: str) -> None:
         if self.fox.available:
-            self.fox.capture_default_trigger(capture_data['url'], auto_trigger=True)
+            self.fox.capture_default_trigger(url, auto_trigger=True)

     async def process_capture_queue(self) -> None:
         '''Process a query from the capture queue'''
-        value: List[Tuple[str, float]] = await self.redis.zpopmax('to_capture')
+        value: List[Tuple[bytes, float]] = await self.redis.zpopmax('to_capture')
         if not value or not value[0]:
             # The queue was consumed by an other process.
             return
-        uuid, _score = value[0]
-        queue: Optional[str] = await self.redis.get(f'{uuid}_mgmt')
+        uuid = value[0][0].decode()
+        queue: Optional[bytes] = await self.redis.get(f'{uuid}_mgmt')
         await self.redis.sadd('ongoing', uuid)

         async with self.redis.pipeline() as lazy_cleanup:
@@ -57,55 +60,70 @@ class AsyncCapture(AbstractManager):
                 # queue shouldn't be none, but if it is, just ignore.
                 await lazy_cleanup.zincrby('queues', -1, queue)

-            to_capture: Dict[str, str] = await self.redis.hgetall(uuid)
+            to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid)

             if get_config('generic', 'default_public'):
                 # By default, the captures are on the index, unless the user mark them as un-listed
-                listing = False if ('listing' in to_capture and to_capture['listing'].lower() in ['false', '0', '']) else True
+                listing = False if ('listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
             else:
                 # By default, the captures are not on the index, unless the user mark them as listed
-                listing = True if ('listing' in to_capture and to_capture['listing'].lower() in ['true', '1']) else False
+                listing = True if ('listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False

             # Turn the freetext for the headers into a dict
-            headers = {}
-            if 'headers' in to_capture:
-                for header_line in to_capture['headers'].splitlines():
+            headers: Dict[str, str] = {}
+            if b'headers' in to_capture:
+                for header_line in to_capture[b'headers'].decode().splitlines():
                     if header_line and ':' in header_line:
                         splitted = header_line.split(':', 1)
                         if splitted and len(splitted) == 2:
                             header, h_value = splitted
                             if header and h_value:
                                 headers[header.strip()] = h_value.strip()
-            if to_capture.get('dnt'):
-                headers['DNT'] = to_capture['dnt']
+            if to_capture.get(b'dnt'):
+                headers['DNT'] = to_capture[b'dnt'].decode()

-            self.logger.info(f'Capturing {to_capture["url"]} - {uuid}')
-            self.thirdparty_submit(to_capture)
-            success, error_message = await self._capture(
-                to_capture['url'],
-                perma_uuid=uuid,
-                cookies_pseudofile=to_capture.get('cookies', None),
-                listing=listing,
-                user_agent=to_capture.get('user_agent', None),
-                referer=to_capture.get('referer', None),
-                headers=headers if headers else None,
-                proxy=to_capture.get('proxy', None),
-                os=to_capture.get('os', None),
-                browser=to_capture.get('browser', None),
-                parent=to_capture.get('parent', None)
-            )
-            if success:
-                self.logger.info(f'Successfully captured {to_capture["url"]} - {uuid}')
+            if to_capture.get(b'document'):
+                # we do not have a URL yet.
+                document_name = Path(to_capture[b'document_name'].decode()).name
+                tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
+                with open(tmp_f.name, "wb") as f:
+                    f.write(to_capture[b'document'])
+                url = f'file://{tmp_f.name}'
             else:
-                self.logger.warning(f'Unable to capture {to_capture["url"]} - {uuid}: {error_message}')
-                await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {to_capture["url"]} - {uuid}')
+                url = to_capture[b'url'].decode()
+                self.thirdparty_submit(url)
+
+            self.logger.info(f'Capturing {url} - {uuid}')
+            success, error_message = await self._capture(
+                url,
+                perma_uuid=uuid,
+                cookies_pseudofile=to_capture.get(b'cookies', None),
+                listing=listing,
+                user_agent=to_capture[b'user_agent'].decode() if to_capture.get(b'user_agent') else None,
+                referer=to_capture[b'referer'].decode() if to_capture.get(b'referer') else None,
+                headers=headers if headers else None,
+                proxy=to_capture[b'proxy'].decode() if to_capture.get(b'proxy') else None,
+                os=to_capture[b'os'].decode() if to_capture.get(b'os') else None,
+                browser=to_capture[b'browser'].decode() if to_capture.get(b'browser') else None,
+                parent=to_capture[b'parent'].decode() if to_capture.get(b'parent') else None
+            )
+
+            if to_capture.get(b'document'):
+                os.unlink(tmp_f.name)
+
+            if success:
+                self.logger.info(f'Successfully captured {url} - {uuid}')
             else:
+                self.logger.warning(f'Unable to capture {url} - {uuid}: {error_message}')
+                await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')
             await lazy_cleanup.srem('ongoing', uuid)
             await lazy_cleanup.delete(uuid)
             # make sure to expire the key if nothing was processed for a while (= queues empty)
             await lazy_cleanup.expire('queues', 600)
             await lazy_cleanup.execute()

-    async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+    async def _capture(self, url: str, *, perma_uuid: str,
+                       cookies_pseudofile: Optional[Union[BufferedIOBase, str, bytes]]=None,
                        listing: bool=True, user_agent: Optional[str]=None,
                        referer: Optional[str]=None,
                        headers: Optional[Dict[str, str]]=None,
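The hunk above is the heart of the new feature on the worker side: when a queued capture carries an uploaded document rather than a URL, the raw bytes are written to a temporary file, the headless browser is pointed at it through a file:// URL, and the file is unlinked once the capture has finished. A condensed sketch of that flow, outside the diff context (the helper name and the bare dict are illustrative, not the actual worker code):

    from pathlib import Path
    from tempfile import NamedTemporaryFile
    from typing import Dict


    def resolve_capture_url(to_capture: Dict[bytes, bytes]) -> str:
        """Return the URL the headless browser should open for a queued capture (sketch)."""
        if to_capture.get(b'document'):
            # Uploaded file: dump the bytes to disk and capture them through file://.
            document_name = Path(to_capture[b'document_name'].decode()).name
            tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
            with open(tmp_f.name, 'wb') as f:
                f.write(to_capture[b'document'])
            return f'file://{tmp_f.name}'
        # Plain submission: the URL itself is stored as bytes in the Redis hash.
        return to_capture[b'url'].decode()

Passing delete=False keeps the file around until the capture returns, which is why the second "if to_capture.get(b'document'):" block in the diff removes it with os.unlink() afterwards.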
@@ -114,7 +132,7 @@ class AsyncCapture(AbstractManager):
         '''Launch a capture'''
         url = url.strip()
         url = refang(url)
-        if not url.startswith('http'):
+        if not url.startswith('data') and not url.startswith('http') and not url.startswith('file'):
             url = f'http://{url}'
         splitted_url = urlsplit(url)
         if self.only_global_lookups:
@@ -187,11 +205,11 @@ class AsyncCapture(AbstractManager):
                 _parent.write(parent)

         if 'downloaded_filename' in entries and entries['downloaded_filename']:
-            with(dirpath / '0.data.filename').open('w') as _downloaded_filename:
+            with (dirpath / '0.data.filename').open('w') as _downloaded_filename:
                 _downloaded_filename.write(entries['downloaded_filename'])

         if 'downloaded_file' in entries and entries['downloaded_file']:
-            with(dirpath / '0.data').open('wb') as _downloaded_file:
+            with (dirpath / '0.data').open('wb') as _downloaded_file:
                 _downloaded_file.write(entries['downloaded_file'])

         if 'error' in entries:
@@ -223,7 +241,7 @@ class AsyncCapture(AbstractManager):
         return True, 'All good!'

     async def _to_run_forever_async(self):
-        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
+        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'))
         while await self.redis.exists('to_capture'):
             await self.process_capture_queue()
             if self.shutdown_requested():

@@ -129,14 +129,14 @@ def load_known_content(directory: str='known_content') -> Dict[str, Dict[str, Any]]:
     return to_return


-def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str]]=None) -> List[Dict[str, Union[str, bool]]]:
+def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str, bytes]]=None) -> List[Dict[str, Union[str, bool]]]:
     cookies: List[Dict[str, Union[str, bool]]]
     if cookie_pseudofile:
-        if isinstance(cookie_pseudofile, str):
+        if isinstance(cookie_pseudofile, (str, bytes)):
             try:
                 cookies = json.loads(cookie_pseudofile)
             except json.decoder.JSONDecodeError:
-                logger.warning(f'Unable to load json content: {cookie_pseudofile}')
+                logger.warning(f'Unable to load json content: {cookie_pseudofile!r}')
                 return []
         else:
             # Note: we might have an empty BytesIO, which is not False.

@@ -5,6 +5,7 @@ import hashlib
 import json
 import logging
 import operator
+import pickle
 import smtplib

 from collections import defaultdict
@@ -398,11 +399,9 @@ class Lookyloo():
                 query[key] = 1 if value else 0
             elif isinstance(value, (list, dict)):
                 query[key] = json.dumps(value)
-            elif isinstance(value, bytes):
-                query[key] = value.decode()

         # dirty deduplicate
-        hash_query = hashlib.sha512(json.dumps(query).encode()).hexdigest()
+        hash_query = hashlib.sha512(pickle.dumps(query)).hexdigest()
         # FIXME The line below should work, but it doesn't
         # if (existing_uuid := self.redis.set(f'query_hash:{hash_query}', temp_uuid, get=True, nx=True, ex=300)):
         if (existing_uuid := self.redis.get(f'query_hash:{hash_query}')):
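The switch from json.dumps to pickle.dumps in the deduplication hash above is presumably needed because a capture query can now carry the uploaded document as raw bytes, which the JSON encoder refuses to serialize. A minimal illustration with a hypothetical query:

    import hashlib
    import json
    import pickle

    query = {'document': b'<html>hello</html>', 'document_name': 'hello.html', 'listing': 0}

    try:
        hashlib.sha512(json.dumps(query).encode()).hexdigest()
    except TypeError as error:
        print(error)  # Object of type bytes is not JSON serializable

    # pickle serializes bytes natively, so the dedup hash keeps working
    print(hashlib.sha512(pickle.dumps(query)).hexdigest())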
File diff suppressed because it is too large

@@ -6,7 +6,9 @@ import json
 import logging
 import os
 import time
-import filetype
+
+import filetype  # type: ignore
+
 from datetime import date, datetime, timedelta, timezone
 from io import BytesIO, StringIO
 from typing import Any, Dict, List, Optional, Union, TypedDict
@@ -489,15 +491,19 @@ def image(tree_uuid: str):
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')


 @app.route('/tree/<string:tree_uuid>/data', methods=['GET'])
 def data(tree_uuid: str):
     filename, data = lookyloo.get_data(tree_uuid)
-    if len(filename) != 0:
+    if len(filename) == 0:
         # TODO: return something saying it is not a valid request
         return

+    if filetype.guess_mime(data.getvalue()) is None:
+        mime = 'application/octet-stream'
+    else:
+        mime = filetype.guess_mime(data.getvalue())
-    return send_file(data, mimetype= mime,
+    return send_file(data, mimetype=mime,
                      as_attachment=True, attachment_filename=filename)


@@ -856,7 +862,11 @@ def capture_web():
     else:
         user = src_request_ip(request)

-    if request.method == 'POST' and (request.form.get('url') or request.form.get('urls')):
+    if request.method == 'POST':
+        if not (request.form.get('url') or request.form.get('urls') or 'document' in request.files):
+            flash('Invalid submission: please submit at least a URL or a document.', 'error')
+            return _prepare_capture_template(user_ua=request.headers.get('User-Agent'))
+
         capture_query: Dict[str, Union[str, bytes, int, bool]] = {}
         # check if the post request has the file part
         if 'cookies' in request.files and request.files['cookies'].filename:
@@ -902,7 +912,7 @@ def capture_web():
             perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
             time.sleep(2)
             return redirect(url_for('tree', tree_uuid=perma_uuid))
-        else:
+        elif request.form.get('urls'):
             # bulk query
             bulk_captures = []
             for url in request.form['urls'].split('\n'):
@@ -912,6 +922,13 @@ def capture_web():
                 bulk_captures.append((new_capture_uuid, url))

             return render_template('bulk_captures.html', bulk_captures=bulk_captures)
+        elif 'document' in request.files:
+            # File upload
+            capture_query['document'] = request.files['document'].stream.read()
+            capture_query['document_name'] = request.files['document'].filename
+            perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
+            time.sleep(2)
+            return redirect(url_for('tree', tree_uuid=perma_uuid))
     elif request.method == 'GET' and request.args.get('url'):
         url = unquote_plus(request.args['url']).strip()
         capture_query = {'url': url}

@@ -326,7 +326,9 @@ class CaptureCookies(Resource):
 # Just text

 submit_fields_post = api.model('SubmitFieldsPost', {
-    'url': fields.Url(description="The URL to capture", required=True),
+    'url': fields.Url(description="The URL to capture"),
+    'document': fields.String(description="A base64 encoded document, it can be anything a browser can display."),
+    'document_name': fields.String(description="The name of the document."),
     'listing': fields.Integer(description="Display the capture on the index", min=0, max=1, example=1),
     'user_agent': fields.String(description="User agent to use for the capture", example=''),
     'referer': fields.String(description="Referer to pass to the capture", example=''),
@@ -376,6 +378,8 @@ class SubmitCapture(Resource):
         else:
             user = src_request_ip(request)
         to_query: Dict = request.get_json(force=True)
+        if 'document' in to_query:
+            to_query['document'] = base64.b64decode(to_query['document'])
         perma_uuid = lookyloo.enqueue_capture(to_query, source='api', user=user, authenticated=flask_login.current_user.is_authenticated)
         return perma_uuid

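Taken together, the two hunks above let the REST API accept a file in place of a URL: 'url' is no longer a required field, and a submission may instead carry 'document' (base64-encoded) plus 'document_name', which the POST handler decodes before queuing the capture. A minimal client-side sketch (the instance URL is a placeholder, and the /submit path and the use of the requests library are assumptions, not part of this diff):

    import base64
    from pathlib import Path

    import requests  # third-party HTTP client, used here only for illustration

    instance = 'https://lookyloo.example.org'  # placeholder Lookyloo instance

    document = Path('page_to_capture.html').read_bytes()
    response = requests.post(f'{instance}/submit', json={
        'document': base64.b64encode(document).decode(),
        'document_name': 'page_to_capture.html',
        'listing': 0,
    })
    print(response.json())  # UUID of the enqueued capture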
@@ -39,12 +39,24 @@
       </div>
     </div>

+    <nav>
+      <div class="nav nav-tabs" id="submission-type" role="tablist">
+        <button class="nav-link active" id="nav-url-tab" data-bs-toggle="tab" data-bs-target="#nav-url" type="button" role="tab" aria-current="nav-url" aria-selected="true" href="#">URL(s)</button>
+        <button class="nav-link" id="nav-doc-tab" data-bs-toggle="tab" data-bs-target="#nav-doc" type="button" role="tab" aria-current="nav-doc" aria-selected="false" href="#">Document</button>
+      </div>
+    </nav>
+
+    <div class="tab-content" id="nav-tabContent">
+      </br>
+      <div class="tab-pane fade show active" id="nav-url" role="tabpanel" aria-labelledby="nav-url-tab">
         <div class="row input-group mb-3">
           <label for="url" class="col-sm-1 col-form-label">URL:</label>
           <input type="text" class="form-control col-auto" name="url" id=singleCaptureField
-                 placeholder="URL to capture" value="{{predefined_url_to_capture}}" required>
+                 placeholder="URL to capture" value="{{predefined_url_to_capture}}">
+
           <textarea class="form-control col-auto d-none" placeholder="URLs to capture, one per line"
                     name="urls" id=multipleCapturesField></textarea>
+
           <span class="col-sm-2 input-group-text">
             <div class="form-check">
               <input class="form-check-input" name="multipleCaptures" id="multipleCaptures" type="checkbox"
@@ -53,6 +65,19 @@
             </div>
           </span>
         </div>
+      </div>
+
+      <div class="tab-pane fade" id="nav-doc" role="tabpanel" aria-labelledby="nav-doc-tab">
+        <div class="row mb-3">
+          <label for="document" class="col-sm-1 col-form-label">Document:</label>
+          <div class="col-sm-10">
+            <input type="file" class="form-control-file" id="document" name="document">
+            <div>Instead of a URL, you can upload a file. Preferably an HTML document, but it can be anything supported by a browser.</div>
+          </div>
+        </div>
+      </div>
+    </div>
     <div class="dropdown-divider"></div>

     <div>
       <button class="btn btn-link" type="button" data-bs-toggle="collapse" data-bs-target="#collapseConfigBrowser"
@@ -228,16 +253,12 @@
     if (document.getElementById('multipleCaptures').checked == true) {
       document.getElementById('singleCaptureField').value = '';
       $("#singleCaptureField").addClass("d-none");
-      $("#singleCaptureField").removeAttr("required");
       $("#multipleCapturesField").removeClass("d-none");
-      $("#multipleCapturesField").attr("required", true);
     }
     else {
       document.getElementById('multipleCapturesField').value = '';
       $("#singleCaptureField").removeClass("d-none");
-      $("#singleCaptureField").attr("required", true);
       $("#multipleCapturesField").addClass("d-none");
-      $("#multipleCapturesField").removeAttr("required");
     }
   })
 </script>