From 998ef12b060be95d0f9f31a64c94779767d3acc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Thu, 18 Aug 2022 11:19:32 +0200
Subject: [PATCH] new: Add support for playwright devices and browser name (API only)

---
 bin/async_capture.py      | 116 ++++++++++++++++++++------------------
 lookyloo/lookyloo.py      |   4 ++
 poetry.lock               |  53 +++++++++--------
 pyproject.toml            |   2 +-
 website/web/genericapi.py |  18 +++++-
 5 files changed, 112 insertions(+), 81 deletions(-)

diff --git a/bin/async_capture.py b/bin/async_capture.py
index ebf6aa5..6bf9821 100755
--- a/bin/async_capture.py
+++ b/bin/async_capture.py
@@ -16,7 +16,7 @@ from urllib.parse import urlsplit
 from defang import refang  # type: ignore
 from redis.asyncio import Redis
 
-from playwrightcapture import Capture
+from playwrightcapture import Capture, PlaywrightCaptureException
 
 from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
 from lookyloo.helpers import get_captures_dir, load_cookies, UserAgents
@@ -50,53 +50,46 @@ class AsyncCapture(AbstractManager):
         if not value or not value[0]:
             # The queue was consumed by another process.
             return
-        uuid = value[0][0].decode()
-        queue: Optional[bytes] = await self.redis.get(f'{uuid}_mgmt')
+        uuid: str = value[0][0].decode()
+        queue: Optional[bytes] = await self.redis.getdel(f'{uuid}_mgmt')
         await self.redis.sadd('ongoing', uuid)
 
-        async with self.redis.pipeline() as lazy_cleanup:
-            await lazy_cleanup.delete(f'{uuid}_mgmt')
-            if queue:
-                # queue shouldn't be none, but if it is, just ignore.
-                await lazy_cleanup.zincrby('queues', -1, queue)
+        to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid)
 
-            to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid)
+        if get_config('generic', 'default_public'):
+            # By default, the captures are on the index, unless the user marks them as unlisted
+            listing = False if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
+        else:
+            # By default, the captures are not on the index, unless the user marks them as listed
+            listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False
 
-            if get_config('generic', 'default_public'):
-                # By default, the captures are on the index, unless the user mark them as un-listed
-                listing = False if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
-            else:
-                # By default, the captures are not on the index, unless the user mark them as listed
-                listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False
+        # Turn the freetext for the headers into a dict
+        headers: Dict[str, str] = {}
+        if b'headers' in to_capture:
+            for header_line in to_capture[b'headers'].decode().splitlines():
+                if header_line and ':' in header_line:
+                    splitted = header_line.split(':', 1)
+                    if splitted and len(splitted) == 2:
+                        header, h_value = splitted
+                        if header and h_value:
+                            headers[header.strip()] = h_value.strip()
+        if to_capture.get(b'dnt'):
+            headers['DNT'] = to_capture[b'dnt'].decode()
 
-            # Turn the freetext for the headers into a dict
-            headers: Dict[str, str] = {}
-            if b'headers' in to_capture:
-                for header_line in to_capture[b'headers'].decode().splitlines():
-                    if header_line and ':' in header_line:
-                        splitted = header_line.split(':', 1)
-                        if splitted and len(splitted) == 2:
-                            header, h_value = splitted
-                            if header and h_value:
-                                headers[header.strip()] = h_value.strip()
-            if to_capture.get(b'dnt'):
-                headers['DNT'] = to_capture[b'dnt'].decode()
-
-            if to_capture.get(b'document'):
-                # we do not have a URL yet.
-                document_name = Path(to_capture[b'document_name'].decode()).name
-                tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
-                with open(tmp_f.name, "wb") as f:
-                    f.write(to_capture[b'document'])
-                url = f'file://{tmp_f.name}'
-            elif to_capture.get(b'url'):
-                url = to_capture[b'url'].decode()
-                self.thirdparty_submit(url)
-            else:
-                self.logger.warning(f'Invalid capture {to_capture}.')
-                await lazy_cleanup.execute()
-                return
+        if to_capture.get(b'document'):
+            # we do not have a URL yet.
+            document_name = Path(to_capture[b'document_name'].decode()).name
+            tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
+            with open(tmp_f.name, "wb") as f:
+                f.write(to_capture[b'document'])
+            url = f'file://{tmp_f.name}'
+        elif to_capture.get(b'url'):
+            url = to_capture[b'url'].decode()
+            self.thirdparty_submit(url)
+        else:
+            self.logger.warning(f'Invalid capture {to_capture}.')
+        if url:
             self.logger.info(f'Capturing {url} - {uuid}')
             success, error_message = await self._capture(
                 url,
@@ -109,6 +102,8 @@ class AsyncCapture(AbstractManager):
                 proxy=to_capture[b'proxy'].decode() if to_capture.get(b'proxy') else None,
                 os=to_capture[b'os'].decode() if to_capture.get(b'os') else None,
                 browser=to_capture[b'browser'].decode() if to_capture.get(b'browser') else None,
+                browser_engine=to_capture[b'browser_engine'].decode() if to_capture.get(b'browser_engine') else None,
+                device_name=to_capture[b'device_name'].decode() if to_capture.get(b'device_name') else None,
                 parent=to_capture[b'parent'].decode() if to_capture.get(b'parent') else None
             )
 
@@ -119,7 +114,11 @@ class AsyncCapture(AbstractManager):
             self.logger.info(f'Successfully captured {url} - {uuid}')
         else:
             self.logger.warning(f'Unable to capture {url} - {uuid}: {error_message}')
-            await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')
+            await self.redis.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')
+
+        async with self.redis.pipeline() as lazy_cleanup:
+            if queue and await self.redis.zscore('queues', queue):
+                await lazy_cleanup.zincrby('queues', -1, queue)
             await lazy_cleanup.srem('ongoing', uuid)
             await lazy_cleanup.delete(uuid)
             # make sure to expire the key if nothing was processed for a while (= queues empty)
@@ -132,7 +131,10 @@ class AsyncCapture(AbstractManager):
                       referer: Optional[str]=None,
                       headers: Optional[Dict[str, str]]=None,
                       proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
-                      browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
+                      browser: Optional[str]=None, parent: Optional[str]=None,
+                      browser_engine: Optional[str]=None,
+                      device_name: Optional[str]=None,
+                      viewport: Optional[Dict[str, int]]=None) -> Tuple[bool, str]:
         '''Launch a capture'''
         url = url.strip()
         url = refang(url)
@@ -159,23 +161,29 @@ class AsyncCapture(AbstractManager):
                 and splitted_url.hostname.split('.')[-1] == 'onion'):
             proxy = get_config('generic', 'tor_proxy')
 
-        cookies = load_cookies(cookies_pseudofile)
         if not user_agent:
             # Catch case where the UA is broken on the UI, and the async submission.
-            self.user_agents.user_agents  # triggers an update if needed
-            ua: str = self.user_agents.default['useragent']
-        else:
-            ua = user_agent
+            self.user_agents.user_agents  # triggers an update of the default UAs
 
         self.logger.info(f'Capturing {url}')
         try:
-            async with Capture(proxy=proxy) as capture:
-                capture.prepare_cookies(cookies)
-                capture.user_agent = ua
+            async with Capture(browser=browser_engine, device_name=device_name, proxy=proxy) as capture:
                 if headers:
-                    capture.http_headers = headers
-                await capture.prepare_context()
+                    capture.headers = headers
+                if cookies_pseudofile:
+                    # required by Mypy: https://github.com/python/mypy/issues/3004
+                    capture.cookies = load_cookies(cookies_pseudofile)  # type: ignore
+                if viewport:
+                    # required by Mypy: https://github.com/python/mypy/issues/3004
+                    capture.viewport = viewport  # type: ignore
+                if not device_name:
+                    capture.user_agent = user_agent if user_agent else self.user_agents.default['useragent']
+                await capture.initialize_context()
                 entries = await capture.capture_page(url, referer=referer)
+        except PlaywrightCaptureException as e:
+            self.logger.exception(f'Invalid parameters for the capture of {url} - {e}')
+            return False, f'Invalid parameters for the capture of {url} - {e}'
+
         except Exception as e:
             self.logger.exception(f'Something went terribly wrong when capturing {url} - {e}')
             return False, f'Something went terribly wrong when capturing {url}.'
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 298b2ed..67781e9 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -21,6 +21,7 @@ from zipfile import ZipFile
 from defang import defang  # type: ignore
 from har2tree import CrawledTree, HostNode, URLNode
 from PIL import Image, UnidentifiedImageError
+from playwrightcapture import get_devices
 from pymisp import MISPAttribute, MISPEvent, MISPObject
 from redis import ConnectionPool, Redis
 from redis.connection import UnixDomainSocketConnection
@@ -874,6 +875,9 @@ class Lookyloo():
         ct = self.get_crawled_tree(tree_uuid)
         return {node.name for node in ct.root_hartree.url_tree.traverse()}
 
+    def get_playwright_devices(self):
+        return get_devices()
+
     def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
         '''Gather all the information needed to display the Hostnode investigator popup.'''
 
diff --git a/poetry.lock b/poetry.lock
index 0852516..ec9f0b7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -50,7 +50,7 @@ python-versions = "*"
 
 [[package]]
 name = "asttokens"
-version = "2.0.7"
+version = "2.0.8"
 description = "Annotate AST trees with source code positions"
 category = "dev"
 optional = false
@@ -544,7 +544,7 @@ i18n = ["Babel (>=2.7)"]
 
 [[package]]
 name = "jsonschema"
-version = "4.9.1"
+version = "4.10.3"
 description = "An implementation of JSON Schema validation for Python"
 category = "main"
 optional = false
@@ -592,7 +592,7 @@ python-versions = ">=3.7"
 
 [[package]]
 name = "matplotlib-inline"
-version = "0.1.3"
+version = "0.1.6"
 description = "Inline Matplotlib backend for Jupyter"
 category = "dev"
 optional = false
@@ -724,7 +724,7 @@ python-versions = ">=3.6"
 
 [[package]]
 name = "playwright"
-version = "1.24.1"
+version = "1.25.1"
 description = "A high-level API to automate web browsers"
 category = "main"
 optional = false
@@ -738,7 +738,7 @@ websockets = "10.1"
 
 [[package]]
 name = "playwrightcapture"
-version = "1.14.0"
+version = "1.14.1"
 description = "A simple library to capture websites using playwright"
 category = "main"
 optional = false
@@ -746,7 +746,7 @@ python-versions = ">=3.8,<4.0"
 
 [package.dependencies]
 dateparser = ">=1.1.1,<2.0.0"
-playwright = ">=1.24.1,<2.0.0"
+playwright = ">=1.25.1,<2.0.0"
 
 [package.extras]
 recaptcha = ["requests (>=2.28.1,<3.0.0)", "pydub (>=0.25.1,<0.26.0)", "SpeechRecognition (>=3.8.1,<4.0.0)"]
@@ -838,12 +838,15 @@ python-versions = "*"
 
 [[package]]
 name = "pygments"
-version = "2.12.0"
+version = "2.13.0"
 description = "Pygments is a syntax highlighting package written in Python."
 category = "main"
 optional = false
 python-versions = ">=3.6"
 
+[package.extras]
+plugins = ["importlib-metadata"]
+
 [[package]]
 name = "pyhashlookup"
 version = "1.2.0"
@@ -1410,7 +1413,7 @@ misp = ["python-magic", "pydeep2"]
 
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.8,<3.11"
-content-hash = "ce26fadedd7a02ea21bfb8bcf8394e5f24e26e9033516cbd5c608ecaa954b95b"
+content-hash = "d6422ef6314227f2b23aa623853c3b957e28a01da9a070cdfbd4a80fd4559d6f"
 
 [metadata.files]
 aiohttp = [
@@ -1500,8 +1503,8 @@ appnope = [
     {file = "appnope-0.1.3.tar.gz", hash = "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24"},
 ]
 asttokens = [
-    {file = "asttokens-2.0.7-py2.py3-none-any.whl", hash = "sha256:f5589ef8518f73dd82c15e1c19f795d8a62c133485e557c04443d4a1a730cf9f"},
-    {file = "asttokens-2.0.7.tar.gz", hash = "sha256:8444353e4e2a99661c8dfb85ec9c02eedded08f0006234bff7db44a06840acc2"},
+    {file = "asttokens-2.0.8-py2.py3-none-any.whl", hash = "sha256:e3305297c744ae53ffa032c45dc347286165e4ffce6875dc662b205db0623d86"},
+    {file = "asttokens-2.0.8.tar.gz", hash = "sha256:c61e16246ecfb2cde2958406b4c8ebc043c9e6d73aaa83c941673b35e5d3a76b"},
 ]
 async-timeout = [
     {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
@@ -1847,8 +1850,8 @@ jinja2 = [
     {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"},
 ]
 jsonschema = [
-    {file = "jsonschema-4.9.1-py3-none-any.whl", hash = "sha256:8ebad55894c002585271af2d327d99339ef566fb085d9129b69e2623867c4106"},
-    {file = "jsonschema-4.9.1.tar.gz", hash = "sha256:408c4c8ed0dede3b268f7a441784f74206380b04f93eb2d537c7befb3df3099f"},
+    {file = "jsonschema-4.10.3-py3-none-any.whl", hash = "sha256:443442f9ac2fdfde7bc99079f0ba08e5d167fc67749e9fc706a393bc8857ca48"},
+    {file = "jsonschema-4.10.3.tar.gz", hash = "sha256:59ad13764820eb9d2cafc6db32e92fabd318c1e4e3f2205e646225283704a2c3"},
 ]
 lief = [
     {file = "lief-0.12.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:4fbbc9d520de87ac22210c62d22a9b088e5460f9a028741311e6f68ef8877ddd"},
@@ -1992,8 +1995,8 @@ markupsafe = [
     {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
 ]
 matplotlib-inline = [
-    {file = "matplotlib-inline-0.1.3.tar.gz", hash = "sha256:a04bfba22e0d1395479f866853ec1ee28eea1485c1d69a6faf00dc3e24ff34ee"},
-    {file = "matplotlib_inline-0.1.3-py3-none-any.whl", hash = "sha256:aed605ba3b72462d64d475a21a9296f400a19c4f74a31b59103d2a99ffd5aa5c"},
+    {file = "matplotlib-inline-0.1.6.tar.gz", hash = "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"},
+    {file = "matplotlib_inline-0.1.6-py3-none-any.whl", hash = "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311"},
 ]
 multidict = [
     {file = "multidict-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b9e95a740109c6047602f4db4da9949e6c5945cefbad34a1299775ddc9a62e2"},
@@ -2200,17 +2203,17 @@ pkgutil-resolve-name = [
     {file = "pkgutil_resolve_name-1.3.10.tar.gz", hash = "sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"},
"sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"}, ] playwright = [ - {file = "playwright-1.24.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:04575c6682098a2f5e851a3fb04bc0e3269af6e00732798103a02f63e4409dfb"}, - {file = "playwright-1.24.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:279c8f150662292814499b61ccbc35917f5041592ede1853e2a46e59456500df"}, - {file = "playwright-1.24.1-py3-none-macosx_11_0_universal2.whl", hash = "sha256:e0821859b625ad3dd1c89c9359d5d10d48ce1af81dc99d702437721ab72cd150"}, - {file = "playwright-1.24.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:f6add5cc97a8b3bca775c38ba3382fe73004d59adc9e404bf59f23f35442ede7"}, - {file = "playwright-1.24.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b3f2acda4c22e78c94a47eb5b6f77f20c0313ec76f8e19d77c09bee1c44631d9"}, - {file = "playwright-1.24.1-py3-none-win32.whl", hash = "sha256:5ad16b26f4ae2d539c580273bd323907e24a125dbf0f38460466608d1cb39c83"}, - {file = "playwright-1.24.1-py3-none-win_amd64.whl", hash = "sha256:7355d00cd2cb24265779b1cd9b18d9dcac17eea4a558d57ce20df650fc766f53"}, + {file = "playwright-1.25.1-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:1dbe89f4e3dae53add2c6b642cd07c44474eaba88593e29be7ae82106ede8e63"}, + {file = "playwright-1.25.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:58641991bcf43ade2a0740ece6e9d22deff228a6358f9aa61a290b7c4ab6f6ab"}, + {file = "playwright-1.25.1-py3-none-macosx_11_0_universal2.whl", hash = "sha256:426f2e839671b6fe803a87ce3c7b38a8b3c552565863700791238a97f5f1ad24"}, + {file = "playwright-1.25.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:25b7ca2ee2bdf668dc487563355f42fc354bf5a386eaf639ace44133af7c7ab3"}, + {file = "playwright-1.25.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de9cd487b28e7d03eb04ab8f8e23bfa75c18dffc897396dffa8e9f1be0982d22"}, + {file = "playwright-1.25.1-py3-none-win32.whl", hash = "sha256:ca66ec55858fddfb0255a35c4c320795178b69424a51f95fe09530fed71e9abf"}, + {file = "playwright-1.25.1-py3-none-win_amd64.whl", hash = "sha256:d5c64d4b6f4ab56ea0acf5446f3aa3834beea8d871c58a49eff189aa3cf85d13"}, ] playwrightcapture = [ - {file = "PlaywrightCapture-1.14.0-py3-none-any.whl", hash = "sha256:490df4f16f057c2b1c169aaf037d5906981c1ab2d545b17fe54d89be61b61436"}, - {file = "PlaywrightCapture-1.14.0.tar.gz", hash = "sha256:22e01bbb41581e7ead3a783177fead523b216030b609a19313223381468e11fb"}, + {file = "PlaywrightCapture-1.14.1-py3-none-any.whl", hash = "sha256:9f83f65c3842825a15b05266aa254bffc598aa53727f57b00bdf3f947418fde1"}, + {file = "PlaywrightCapture-1.14.1.tar.gz", hash = "sha256:9dbffb9336a7537697a90e02c454b2a17af1e4bfff281ce831445c70b7f973cd"}, ] prompt-toolkit = [ {file = "prompt_toolkit-3.0.30-py3-none-any.whl", hash = "sha256:d8916d3f62a7b67ab353a952ce4ced6a1d2587dfe9ef8ebc30dd7c386751f289"}, @@ -2258,8 +2261,8 @@ pyfaup = [ {file = "pyfaup-1.2.tar.gz", hash = "sha256:5648bc3ebd80239aec927aedfc218c3a6ff36de636cc53822bfeb70b0869b1e7"}, ] pygments = [ - {file = "Pygments-2.12.0-py3-none-any.whl", hash = "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"}, - {file = "Pygments-2.12.0.tar.gz", hash = "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb"}, + {file = "Pygments-2.13.0-py3-none-any.whl", hash = "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"}, + {file = "Pygments-2.13.0.tar.gz", hash = "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1"}, ] pyhashlookup = [ {file = 
"pyhashlookup-1.2.0-py3-none-any.whl", hash = "sha256:219a16381330b9ca6d9f36f514583bc0cfdb04ff44fd6a8d5e9be18e3497979c"}, diff --git a/pyproject.toml b/pyproject.toml index d413246..6714cb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,7 @@ lief = "^0.12.1" ua-parser = "^0.15.0" Flask-Login = "^0.6.2" har2tree = "^1.14.1" -playwrightcapture = "^1.14.0" +playwrightcapture = "^1.14.1" passivetotal = "^2.5.9" werkzeug = "2.1.2" filetype = "^1.1.0" diff --git a/website/web/genericapi.py b/website/web/genericapi.py index d037505..06af4f3 100644 --- a/website/web/genericapi.py +++ b/website/web/genericapi.py @@ -299,6 +299,14 @@ class InstanceStats(Resource): return lookyloo.get_stats() +@api.route('/json/devices') +@api.doc(description='Get the list of devices pre-configured on the platform') +class Devices(Resource): + + def get(self): + return lookyloo.get_playwright_devices() + + @api.route('/json//stats') @api.doc(description='Get the statistics of the capture.', params={'capture_uuid': 'The UUID of the capture'}) @@ -331,8 +339,10 @@ submit_fields_post = api.model('SubmitFieldsPost', { 'document_name': fields.String(description="The name of the document."), 'listing': fields.Integer(description="Display the capture on the index", min=0, max=1, example=1), 'user_agent': fields.String(description="User agent to use for the capture", example=''), + 'browser_name': fields.String(description="Use this browser. Must be chromium, firefox or webkit.", example=''), + 'device_name': fields.String(description="Use the pre-configured settings for this device. Get a list from /json/devices.", example=''), 'referer': fields.String(description="Referer to pass to the capture", example=''), - 'headers': fields.String(description="Referer to pass to the capture", example='Accept-Language: en-US;q=0.5, fr-FR;q=0.4'), + 'headers': fields.String(description="Headers to pass to the capture", example='Accept-Language: en-US;q=0.5, fr-FR;q=0.4'), 'proxy': fields.Url(description="Proxy to use for the capture. Format: [scheme]://[username]:[password]@[hostname]:[port]", example=''), 'cookies': fields.String(description="JSON export of a list of cookies as exported from an other capture", example='') }) @@ -344,6 +354,8 @@ class SubmitCapture(Resource): @api.param('url', 'The URL to capture', required=True) @api.param('listing', 'Display the capture on the index', default=1) @api.param('user_agent', 'User agent to use for the capture') + @api.param('browser_name', 'Use this browser. Must be chromium, firefox or webkit.') + @api.param('device_name', 'Use the pre-configured settings for this device') @api.param('referer', 'Referer to pass to the capture') @api.param('proxy', 'Proxy to use for the the capture') @api.produces(['text/text']) @@ -360,6 +372,10 @@ class SubmitCapture(Resource): 'listing': False if 'listing' in request.args and request.args['listing'] in [0, '0'] else True} if request.args.get('user_agent'): to_query['user_agent'] = request.args['user_agent'] + if request.args.get('browser_name'): + to_query['browser_name'] = request.args['browser_name'] + if request.args.get('device_name'): + to_query['device_name'] = request.args['device_name'] if request.args.get('referer'): to_query['referer'] = request.args['referer'] if request.args.get('headers'):