new: Add support for playwright devices and browser name (API only)

pull/497/head
Raphaël Vinot 2022-08-18 11:19:32 +02:00
parent 1c8ae0db52
commit 998ef12b06
5 changed files with 112 additions and 81 deletions

View File

@ -16,7 +16,7 @@ from urllib.parse import urlsplit
from defang import refang # type: ignore
from redis.asyncio import Redis
from playwrightcapture import Capture
from playwrightcapture import Capture, PlaywrightCaptureException
from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
from lookyloo.helpers import get_captures_dir, load_cookies, UserAgents
@ -50,53 +50,46 @@ class AsyncCapture(AbstractManager):
if not value or not value[0]:
# The queue was consumed by another process.
return
uuid = value[0][0].decode()
queue: Optional[bytes] = await self.redis.get(f'{uuid}_mgmt')
uuid: str = value[0][0].decode()
queue: Optional[bytes] = await self.redis.getdel(f'{uuid}_mgmt')
await self.redis.sadd('ongoing', uuid)
async with self.redis.pipeline() as lazy_cleanup:
await lazy_cleanup.delete(f'{uuid}_mgmt')
if queue:
# queue shouldn't be none, but if it is, just ignore.
await lazy_cleanup.zincrby('queues', -1, queue)
to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid)
to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid)
if get_config('generic', 'default_public'):
# By default, the captures are on the index, unless the user mark them as un-listed
listing = False if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
else:
# By default, the captures are not on the index, unless the user mark them as listed
listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False
if get_config('generic', 'default_public'):
# By default, the captures are on the index, unless the user mark them as un-listed
listing = False if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
else:
# By default, the captures are not on the index, unless the user mark them as listed
listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False
# Turn the freetext for the headers into a dict
headers: Dict[str, str] = {}
if b'headers' in to_capture:
for header_line in to_capture[b'headers'].decode().splitlines():
if header_line and ':' in header_line:
splitted = header_line.split(':', 1)
if splitted and len(splitted) == 2:
header, h_value = splitted
if header and h_value:
headers[header.strip()] = h_value.strip()
if to_capture.get(b'dnt'):
headers['DNT'] = to_capture[b'dnt'].decode()
# Turn the freetext for the headers into a dict
headers: Dict[str, str] = {}
if b'headers' in to_capture:
for header_line in to_capture[b'headers'].decode().splitlines():
if header_line and ':' in header_line:
splitted = header_line.split(':', 1)
if splitted and len(splitted) == 2:
header, h_value = splitted
if header and h_value:
headers[header.strip()] = h_value.strip()
if to_capture.get(b'dnt'):
headers['DNT'] = to_capture[b'dnt'].decode()
if to_capture.get(b'document'):
# we do not have a URL yet.
document_name = Path(to_capture[b'document_name'].decode()).name
tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
with open(tmp_f.name, "wb") as f:
f.write(to_capture[b'document'])
url = f'file://{tmp_f.name}'
elif to_capture.get(b'url'):
url = to_capture[b'url'].decode()
self.thirdparty_submit(url)
else:
self.logger.warning(f'Invalid capture {to_capture}.')
await lazy_cleanup.execute()
return
if to_capture.get(b'document'):
# we do not have a URL yet.
document_name = Path(to_capture[b'document_name'].decode()).name
tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
with open(tmp_f.name, "wb") as f:
f.write(to_capture[b'document'])
url = f'file://{tmp_f.name}'
elif to_capture.get(b'url'):
url = to_capture[b'url'].decode()
self.thirdparty_submit(url)
else:
self.logger.warning(f'Invalid capture {to_capture}.')
if url:
self.logger.info(f'Capturing {url} - {uuid}')
success, error_message = await self._capture(
url,
@ -109,6 +102,8 @@ class AsyncCapture(AbstractManager):
proxy=to_capture[b'proxy'].decode() if to_capture.get(b'proxy') else None,
os=to_capture[b'os'].decode() if to_capture.get(b'os') else None,
browser=to_capture[b'browser'].decode() if to_capture.get(b'browser') else None,
browser_engine=to_capture[b'browser_engine'].decode() if to_capture.get(b'browser_engine') else None,
device_name=to_capture[b'device_name'].decode() if to_capture.get(b'device_name') else None,
parent=to_capture[b'parent'].decode() if to_capture.get(b'parent') else None
)
@ -119,7 +114,11 @@ class AsyncCapture(AbstractManager):
self.logger.info(f'Successfully captured {url} - {uuid}')
else:
self.logger.warning(f'Unable to capture {url} - {uuid}: {error_message}')
await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')
await self.redis.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')
async with self.redis.pipeline() as lazy_cleanup:
if queue and await self.redis.zscore('queues', queue):
await lazy_cleanup.zincrby('queues', -1, queue)
await lazy_cleanup.srem('ongoing', uuid)
await lazy_cleanup.delete(uuid)
# make sure to expire the key if nothing was processed for a while (= queues empty)
@ -132,7 +131,10 @@ class AsyncCapture(AbstractManager):
referer: Optional[str]=None,
headers: Optional[Dict[str, str]]=None,
proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
browser: Optional[str]=None, parent: Optional[str]=None,
browser_engine: Optional[str]=None,
device_name: Optional[str]=None,
viewport: Optional[Dict[str, int]]=None) -> Tuple[bool, str]:
'''Launch a capture'''
url = url.strip()
url = refang(url)
@ -159,23 +161,29 @@ class AsyncCapture(AbstractManager):
and splitted_url.hostname.split('.')[-1] == 'onion'):
proxy = get_config('generic', 'tor_proxy')
cookies = load_cookies(cookies_pseudofile)
if not user_agent:
# Catch case where the UA is broken on the UI, and the async submission.
self.user_agents.user_agents # triggers an update if needed
ua: str = self.user_agents.default['useragent']
else:
ua = user_agent
self.user_agents.user_agents # triggers an update of the default UAs
self.logger.info(f'Capturing {url}')
try:
async with Capture(proxy=proxy) as capture:
capture.prepare_cookies(cookies)
capture.user_agent = ua
async with Capture(browser=browser_engine, device_name=device_name, proxy=proxy) as capture:
if headers:
capture.http_headers = headers
await capture.prepare_context()
capture.headers = headers
if cookies_pseudofile:
# required by Mypy: https://github.com/python/mypy/issues/3004
capture.cookies = load_cookies(cookies_pseudofile) # type: ignore
if viewport:
# required by Mypy: https://github.com/python/mypy/issues/3004
capture.viewport = viewport # type: ignore
if not device_name:
capture.user_agent = user_agent if user_agent else self.user_agents.default['useragent']
await capture.initialize_context()
entries = await capture.capture_page(url, referer=referer)
except PlaywrightCaptureException as e:
self.logger.exception(f'Invalid parameters for the capture of {url} - {e}')
return False, f'Invalid parameters for the capture of {url} - {e}'
except Exception as e:
self.logger.exception(f'Something went terribly wrong when capturing {url} - {e}')
return False, f'Something went terribly wrong when capturing {url}.'

View File

@ -21,6 +21,7 @@ from zipfile import ZipFile
from defang import defang # type: ignore
from har2tree import CrawledTree, HostNode, URLNode
from PIL import Image, UnidentifiedImageError
from playwrightcapture import get_devices
from pymisp import MISPAttribute, MISPEvent, MISPObject
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection
@ -874,6 +875,9 @@ class Lookyloo():
ct = self.get_crawled_tree(tree_uuid)
return {node.name for node in ct.root_hartree.url_tree.traverse()}
def get_playwright_devices(self):
return get_devices()
def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
'''Gather all the informations needed to display the Hostnode investigator popup.'''

53
poetry.lock generated
View File

@ -50,7 +50,7 @@ python-versions = "*"
[[package]]
name = "asttokens"
version = "2.0.7"
version = "2.0.8"
description = "Annotate AST trees with source code positions"
category = "dev"
optional = false
@ -544,7 +544,7 @@ i18n = ["Babel (>=2.7)"]
[[package]]
name = "jsonschema"
version = "4.9.1"
version = "4.10.3"
description = "An implementation of JSON Schema validation for Python"
category = "main"
optional = false
@ -592,7 +592,7 @@ python-versions = ">=3.7"
[[package]]
name = "matplotlib-inline"
version = "0.1.3"
version = "0.1.6"
description = "Inline Matplotlib backend for Jupyter"
category = "dev"
optional = false
@ -724,7 +724,7 @@ python-versions = ">=3.6"
[[package]]
name = "playwright"
version = "1.24.1"
version = "1.25.1"
description = "A high-level API to automate web browsers"
category = "main"
optional = false
@ -738,7 +738,7 @@ websockets = "10.1"
[[package]]
name = "playwrightcapture"
version = "1.14.0"
version = "1.14.1"
description = "A simple library to capture websites using playwright"
category = "main"
optional = false
@ -746,7 +746,7 @@ python-versions = ">=3.8,<4.0"
[package.dependencies]
dateparser = ">=1.1.1,<2.0.0"
playwright = ">=1.24.1,<2.0.0"
playwright = ">=1.25.1,<2.0.0"
[package.extras]
recaptcha = ["requests (>=2.28.1,<3.0.0)", "pydub (>=0.25.1,<0.26.0)", "SpeechRecognition (>=3.8.1,<4.0.0)"]
@ -838,12 +838,15 @@ python-versions = "*"
[[package]]
name = "pygments"
version = "2.12.0"
version = "2.13.0"
description = "Pygments is a syntax highlighting package written in Python."
category = "main"
optional = false
python-versions = ">=3.6"
[package.extras]
plugins = ["importlib-metadata"]
[[package]]
name = "pyhashlookup"
version = "1.2.0"
@ -1410,7 +1413,7 @@ misp = ["python-magic", "pydeep2"]
[metadata]
lock-version = "1.1"
python-versions = ">=3.8,<3.11"
content-hash = "ce26fadedd7a02ea21bfb8bcf8394e5f24e26e9033516cbd5c608ecaa954b95b"
content-hash = "d6422ef6314227f2b23aa623853c3b957e28a01da9a070cdfbd4a80fd4559d6f"
[metadata.files]
aiohttp = [
@ -1500,8 +1503,8 @@ appnope = [
{file = "appnope-0.1.3.tar.gz", hash = "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24"},
]
asttokens = [
{file = "asttokens-2.0.7-py2.py3-none-any.whl", hash = "sha256:f5589ef8518f73dd82c15e1c19f795d8a62c133485e557c04443d4a1a730cf9f"},
{file = "asttokens-2.0.7.tar.gz", hash = "sha256:8444353e4e2a99661c8dfb85ec9c02eedded08f0006234bff7db44a06840acc2"},
{file = "asttokens-2.0.8-py2.py3-none-any.whl", hash = "sha256:e3305297c744ae53ffa032c45dc347286165e4ffce6875dc662b205db0623d86"},
{file = "asttokens-2.0.8.tar.gz", hash = "sha256:c61e16246ecfb2cde2958406b4c8ebc043c9e6d73aaa83c941673b35e5d3a76b"},
]
async-timeout = [
{file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
@ -1847,8 +1850,8 @@ jinja2 = [
{file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"},
]
jsonschema = [
{file = "jsonschema-4.9.1-py3-none-any.whl", hash = "sha256:8ebad55894c002585271af2d327d99339ef566fb085d9129b69e2623867c4106"},
{file = "jsonschema-4.9.1.tar.gz", hash = "sha256:408c4c8ed0dede3b268f7a441784f74206380b04f93eb2d537c7befb3df3099f"},
{file = "jsonschema-4.10.3-py3-none-any.whl", hash = "sha256:443442f9ac2fdfde7bc99079f0ba08e5d167fc67749e9fc706a393bc8857ca48"},
{file = "jsonschema-4.10.3.tar.gz", hash = "sha256:59ad13764820eb9d2cafc6db32e92fabd318c1e4e3f2205e646225283704a2c3"},
]
lief = [
{file = "lief-0.12.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:4fbbc9d520de87ac22210c62d22a9b088e5460f9a028741311e6f68ef8877ddd"},
@ -1992,8 +1995,8 @@ markupsafe = [
{file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
]
matplotlib-inline = [
{file = "matplotlib-inline-0.1.3.tar.gz", hash = "sha256:a04bfba22e0d1395479f866853ec1ee28eea1485c1d69a6faf00dc3e24ff34ee"},
{file = "matplotlib_inline-0.1.3-py3-none-any.whl", hash = "sha256:aed605ba3b72462d64d475a21a9296f400a19c4f74a31b59103d2a99ffd5aa5c"},
{file = "matplotlib-inline-0.1.6.tar.gz", hash = "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"},
{file = "matplotlib_inline-0.1.6-py3-none-any.whl", hash = "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311"},
]
multidict = [
{file = "multidict-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b9e95a740109c6047602f4db4da9949e6c5945cefbad34a1299775ddc9a62e2"},
@ -2200,17 +2203,17 @@ pkgutil-resolve-name = [
{file = "pkgutil_resolve_name-1.3.10.tar.gz", hash = "sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"},
]
playwright = [
{file = "playwright-1.24.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:04575c6682098a2f5e851a3fb04bc0e3269af6e00732798103a02f63e4409dfb"},
{file = "playwright-1.24.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:279c8f150662292814499b61ccbc35917f5041592ede1853e2a46e59456500df"},
{file = "playwright-1.24.1-py3-none-macosx_11_0_universal2.whl", hash = "sha256:e0821859b625ad3dd1c89c9359d5d10d48ce1af81dc99d702437721ab72cd150"},
{file = "playwright-1.24.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:f6add5cc97a8b3bca775c38ba3382fe73004d59adc9e404bf59f23f35442ede7"},
{file = "playwright-1.24.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b3f2acda4c22e78c94a47eb5b6f77f20c0313ec76f8e19d77c09bee1c44631d9"},
{file = "playwright-1.24.1-py3-none-win32.whl", hash = "sha256:5ad16b26f4ae2d539c580273bd323907e24a125dbf0f38460466608d1cb39c83"},
{file = "playwright-1.24.1-py3-none-win_amd64.whl", hash = "sha256:7355d00cd2cb24265779b1cd9b18d9dcac17eea4a558d57ce20df650fc766f53"},
{file = "playwright-1.25.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:1dbe89f4e3dae53add2c6b642cd07c44474eaba88593e29be7ae82106ede8e63"},
{file = "playwright-1.25.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:58641991bcf43ade2a0740ece6e9d22deff228a6358f9aa61a290b7c4ab6f6ab"},
{file = "playwright-1.25.1-py3-none-macosx_11_0_universal2.whl", hash = "sha256:426f2e839671b6fe803a87ce3c7b38a8b3c552565863700791238a97f5f1ad24"},
{file = "playwright-1.25.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:25b7ca2ee2bdf668dc487563355f42fc354bf5a386eaf639ace44133af7c7ab3"},
{file = "playwright-1.25.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de9cd487b28e7d03eb04ab8f8e23bfa75c18dffc897396dffa8e9f1be0982d22"},
{file = "playwright-1.25.1-py3-none-win32.whl", hash = "sha256:ca66ec55858fddfb0255a35c4c320795178b69424a51f95fe09530fed71e9abf"},
{file = "playwright-1.25.1-py3-none-win_amd64.whl", hash = "sha256:d5c64d4b6f4ab56ea0acf5446f3aa3834beea8d871c58a49eff189aa3cf85d13"},
]
playwrightcapture = [
{file = "PlaywrightCapture-1.14.0-py3-none-any.whl", hash = "sha256:490df4f16f057c2b1c169aaf037d5906981c1ab2d545b17fe54d89be61b61436"},
{file = "PlaywrightCapture-1.14.0.tar.gz", hash = "sha256:22e01bbb41581e7ead3a783177fead523b216030b609a19313223381468e11fb"},
{file = "PlaywrightCapture-1.14.1-py3-none-any.whl", hash = "sha256:9f83f65c3842825a15b05266aa254bffc598aa53727f57b00bdf3f947418fde1"},
{file = "PlaywrightCapture-1.14.1.tar.gz", hash = "sha256:9dbffb9336a7537697a90e02c454b2a17af1e4bfff281ce831445c70b7f973cd"},
]
prompt-toolkit = [
{file = "prompt_toolkit-3.0.30-py3-none-any.whl", hash = "sha256:d8916d3f62a7b67ab353a952ce4ced6a1d2587dfe9ef8ebc30dd7c386751f289"},
@ -2258,8 +2261,8 @@ pyfaup = [
{file = "pyfaup-1.2.tar.gz", hash = "sha256:5648bc3ebd80239aec927aedfc218c3a6ff36de636cc53822bfeb70b0869b1e7"},
]
pygments = [
{file = "Pygments-2.12.0-py3-none-any.whl", hash = "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"},
{file = "Pygments-2.12.0.tar.gz", hash = "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb"},
{file = "Pygments-2.13.0-py3-none-any.whl", hash = "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"},
{file = "Pygments-2.13.0.tar.gz", hash = "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1"},
]
pyhashlookup = [
{file = "pyhashlookup-1.2.0-py3-none-any.whl", hash = "sha256:219a16381330b9ca6d9f36f514583bc0cfdb04ff44fd6a8d5e9be18e3497979c"},

View File

@ -63,7 +63,7 @@ lief = "^0.12.1"
ua-parser = "^0.15.0"
Flask-Login = "^0.6.2"
har2tree = "^1.14.1"
playwrightcapture = "^1.14.0"
playwrightcapture = "^1.14.1"
passivetotal = "^2.5.9"
werkzeug = "2.1.2"
filetype = "^1.1.0"

View File

@ -299,6 +299,14 @@ class InstanceStats(Resource):
return lookyloo.get_stats()
@api.route('/json/devices')
@api.doc(description='Get the list of devices pre-configured on the platform')
class Devices(Resource):
def get(self):
return lookyloo.get_playwright_devices()
@api.route('/json/<string:capture_uuid>/stats')
@api.doc(description='Get the statistics of the capture.',
params={'capture_uuid': 'The UUID of the capture'})
@ -331,8 +339,10 @@ submit_fields_post = api.model('SubmitFieldsPost', {
'document_name': fields.String(description="The name of the document."),
'listing': fields.Integer(description="Display the capture on the index", min=0, max=1, example=1),
'user_agent': fields.String(description="User agent to use for the capture", example=''),
'browser_name': fields.String(description="Use this browser. Must be chromium, firefox or webkit.", example=''),
'device_name': fields.String(description="Use the pre-configured settings for this device. Get a list from /json/devices.", example=''),
'referer': fields.String(description="Referer to pass to the capture", example=''),
'headers': fields.String(description="Referer to pass to the capture", example='Accept-Language: en-US;q=0.5, fr-FR;q=0.4'),
'headers': fields.String(description="Headers to pass to the capture", example='Accept-Language: en-US;q=0.5, fr-FR;q=0.4'),
'proxy': fields.Url(description="Proxy to use for the capture. Format: [scheme]://[username]:[password]@[hostname]:[port]", example=''),
'cookies': fields.String(description="JSON export of a list of cookies as exported from an other capture", example='')
})
@ -344,6 +354,8 @@ class SubmitCapture(Resource):
@api.param('url', 'The URL to capture', required=True)
@api.param('listing', 'Display the capture on the index', default=1)
@api.param('user_agent', 'User agent to use for the capture')
@api.param('browser_name', 'Use this browser. Must be chromium, firefox or webkit.')
@api.param('device_name', 'Use the pre-configured settings for this device')
@api.param('referer', 'Referer to pass to the capture')
@api.param('proxy', 'Proxy to use for the capture')
@api.produces(['text/text'])
@ -360,6 +372,10 @@ class SubmitCapture(Resource):
'listing': False if 'listing' in request.args and request.args['listing'] in [0, '0'] else True}
if request.args.get('user_agent'):
to_query['user_agent'] = request.args['user_agent']
if request.args.get('browser_name'):
to_query['browser_name'] = request.args['browser_name']
if request.args.get('device_name'):
to_query['device_name'] = request.args['device_name']
if request.args.get('referer'):
to_query['referer'] = request.args['referer']
if request.args.get('headers'):