new: Switch away from splash to use playwright

Raphaël Vinot 2022-04-21 14:53:42 +03:00
parent 3839c34e39
commit 8d159ffba0
6 changed files with 245 additions and 707 deletions


@@ -1,6 +1,6 @@
#!/usr/bin/env python3
import base64
import asyncio
import ipaddress
import json
import logging
@@ -13,10 +13,10 @@ from urllib.parse import urlsplit
from defang import refang # type: ignore
from redis import Redis
from scrapysplashwrapper import crawl
from playwrightcapture import Capture
from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
from lookyloo.helpers import get_captures_dir, get_splash_url, load_cookies, splash_status
from lookyloo.helpers import get_captures_dir, load_cookies
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
level=logging.INFO)
@@ -29,10 +29,9 @@ class AsyncCapture(AbstractManager):
self.script_name = 'async_capture'
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
self.capture_dir: Path = get_captures_dir()
self.splash_url: str = get_splash_url()
self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
def process_capture_queue(self) -> None:
async def process_capture_queue(self) -> None:
'''Process a query from the capture queue'''
value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture') # type: ignore
if not value or not value[0]:
@@ -69,11 +68,10 @@ class AsyncCapture(AbstractManager):
headers[header.strip()] = h_value.strip()
self.logger.info(f'Capturing {to_capture["url"]} - {uuid}')
success, error_message = self._capture(
success, error_message = await self._capture(
to_capture['url'],
perma_uuid=uuid,
cookies_pseudofile=to_capture.get('cookies', None),
depth=int(to_capture.get('depth', 1)),
listing=listing,
user_agent=to_capture.get('user_agent', None),
referer=to_capture.get('referer', None),
@@ -94,17 +92,18 @@ class AsyncCapture(AbstractManager):
lazy_cleanup.expire('queues', 600)
lazy_cleanup.execute()
def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None, proxy: Optional[str]=None, os: Optional[str]=None,
browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
listing: bool=True, user_agent: Optional[str]=None,
referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None,
proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
'''Launch a capture'''
url = url.strip()
url = refang(url)
if not url.startswith('http'):
url = f'http://{url}'
splitted_url = urlsplit(url)
if self.only_global_lookups:
splitted_url = urlsplit(url)
if splitted_url.netloc:
if splitted_url.hostname:
if splitted_url.hostname.split('.')[-1] != 'onion':
@@ -118,6 +117,11 @@ class AsyncCapture(AbstractManager):
else:
return False, 'Unable to find hostname or IP in the query.'
# check if onion
if (not proxy and splitted_url.netloc and splitted_url.hostname
and splitted_url.hostname.split('.')[-1] == 'onion'):
proxy = get_config('generic', 'tor_proxy')
cookies = load_cookies(cookies_pseudofile)
if not user_agent:
# Catch case where the UA is broken on the UI, and the async submission.
@@ -125,22 +129,26 @@ class AsyncCapture(AbstractManager):
else:
ua = user_agent
if int(depth) > int(get_config('generic', 'max_depth')):
self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
depth = int(get_config('generic', 'max_depth'))
self.logger.info(f'Capturing {url}')
try:
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
referer=referer, headers=headers, proxy=proxy, log_enabled=True,
log_level=get_config('generic', 'splash_loglevel'))
capture = Capture()
if proxy:
await capture.prepare_capture(proxy=proxy)
else:
await capture.prepare_capture()
capture.cookies = cookies
capture.user_agent = ua
if headers:
capture.http_headers = headers
await capture.prepare_context()
entries = await capture.capture_page(url, referer=referer)
except Exception as e:
self.logger.critical(f'Something went terribly wrong when capturing {url}.')
raise e
if not items:
if not entries:
# broken
self.logger.critical(f'Something went terribly wrong when capturing {url}.')
return False, f'Something went terribly wrong when capturing {url}.'
width = len(str(len(items)))
now = datetime.now()
dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
safe_create_dir(dirpath)
@@ -167,53 +175,42 @@ class AsyncCapture(AbstractManager):
with (dirpath / 'parent').open('w') as _parent:
_parent.write(parent)
for i, item in enumerate(items):
if 'error' in item:
with (dirpath / 'error.txt').open('w') as _error:
json.dump(item['error'], _error)
if 'error' in entries:
with (dirpath / 'error.txt').open('w') as _error:
json.dump(entries['error'], _error)
# The capture went fine
harfile = item['har']
png = base64.b64decode(item['png'])
html = item['html']
last_redirect = item['last_redirected_url']
# The capture went fine
harfile = entries['har']
png = entries['png']
html = entries['html']
last_redirect = entries['last_redirected_url']
with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
json.dump(harfile, _har)
with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
_img.write(png)
with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
_html.write(html)
with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
_redir.write(last_redirect)
with (dirpath / '0.har').open('w') as _har:
json.dump(harfile, _har)
with (dirpath / '0.png').open('wb') as _img:
_img.write(png)
with (dirpath / '0.html').open('w') as _html:
_html.write(html)
with (dirpath / '0.last_redirect.txt').open('w') as _redir:
_redir.write(last_redirect)
if 'childFrames' in item:
child_frames = item['childFrames']
with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
json.dump(child_frames, _iframes)
if 'cookies' in item:
cookies = item['cookies']
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
json.dump(cookies, _cookies)
if 'cookies' in entries:
cookies = entries['cookies']
with (dirpath / '0.cookies.json').open('w') as _cookies:
json.dump(cookies, _cookies)
self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
return True, 'All good!'
def _to_run_forever(self):
async def _to_run_forever_async(self):
while self.redis.exists('to_capture'):
status, message = splash_status()
if not status:
self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
break
self.process_capture_queue()
await self.process_capture_queue()
if self.shutdown_requested():
break
def main():
m = AsyncCapture()
m.run(sleep_in_sec=1)
asyncio.run(m.run_async(sleep_in_sec=1))
if __name__ == '__main__':

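Taken together, the hunks above swap the scrapysplashwrapper crawl() call for playwrightcapture's async Capture object: the capture is prepared, configured (cookies, user agent, headers), and capture_page() returns a single dict instead of a list of per-depth items, which is why the numbered output files collapse into 0.har, 0.png, 0.html and friends. A minimal sketch of that flow, built only from the calls visible in this diff (method names and the keys of the returned dict come from the hunks above; the real PlaywrightCapture API may differ between versions):

import asyncio
import json
from typing import Optional

from playwrightcapture import Capture


async def capture_once(url: str, proxy: Optional[dict] = None) -> None:
    capture = Capture()
    if proxy:
        await capture.prepare_capture(proxy=proxy)
    else:
        await capture.prepare_capture()
    capture.user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'  # placeholder UA, not from the diff
    await capture.prepare_context()
    entries = await capture.capture_page(url)
    if 'error' in entries:
        print('capture failed:', entries['error'])
        return
    # Same artefacts the async capture manager writes into the capture directory
    with open('0.har', 'w') as _har:
        json.dump(entries['har'], _har)
    with open('0.png', 'wb') as _img:
        _img.write(entries['png'])
    with open('0.html', 'w') as _html:
        _html.write(entries['html'])


asyncio.run(capture_once('https://www.example.com'))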

@@ -25,6 +25,9 @@
"enable_bookmark": false,
"auto_trigger_modules": false,
"enable_mail_notification": false,
"tor_proxy": {
"server": "socks5://127.0.0.1:9050"
},
"email": {
"from": "Lookyloo <lookyloo@myorg.local>",
"to": "Investigation Team <investigation_unit@myorg.local>",

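The new tor_proxy block is what the onion check added above reads via get_config('generic', 'tor_proxy') and hands to prepare_capture(proxy=...). Its {"server": ...} shape matches Playwright's own launch(proxy=...) option, so the value can be passed through largely unchanged. A rough illustration with plain Playwright, assuming a local Tor SOCKS proxy listening on 127.0.0.1:9050 as in the sample value:

import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        # Same {"server": ...} structure as the tor_proxy sample above.
        browser = await p.chromium.launch(proxy={'server': 'socks5://127.0.0.1:9050'})
        page = await browser.new_page()
        await page.goto('https://check.torproject.org/')  # needs the local Tor proxy to actually be running
        print(await page.title())
        await browser.close()


asyncio.run(main())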

@@ -335,7 +335,7 @@ class Lookyloo():
self.logger.warning(e)
return None
except Exception as e:
self.logger.critical(e)
self.logger.exception(e)
return None
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:

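The one-line change in the Lookyloo class, from self.logger.critical(e) to self.logger.exception(e), mostly matters for debugging: called inside an except block, Logger.exception logs at ERROR level and appends the full traceback, whereas critical(e) records only the exception's message. A quick standard-library illustration:

import logging

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO)
logger = logging.getLogger('demo')

try:
    1 / 0
except ZeroDivisionError as e:
    logger.critical(e)    # logs only the message: "division by zero"
    logger.exception(e)   # logs the message plus the full traceback
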
poetry.lock (generated, 828 changed lines)

File diff suppressed because it is too large


@@ -46,9 +46,7 @@ bootstrap-flask = "^2.0.2"
defang = "^0.5.3"
vt-py = "^0.14.0"
pyeupi = "^1.1"
scrapysplashwrapper = "^1.11.0"
pysanejs = "^2.0"
har2tree = "^1.11.0"
pylookyloo = "^1.10.0"
dnspython = "^2.2.1"
pytaxonomies = "^1.4.1"
@@ -66,6 +64,8 @@ pyhashlookup = "^1.1.1"
lief = "^0.12.1"
ua-parser = "^0.10.0"
Flask-Login = "^0.6.0"
playwrightcapture = {git = "https://github.com/Lookyloo/PlaywrightCapture.git", rev = "main"}
har2tree = {git = "https://github.com/Lookyloo/har2tree.git", rev = "main"}
[tool.poetry.extras]
misp = ['python-magic', 'pydeep']


@@ -24,7 +24,7 @@ from werkzeug.security import check_password_hash
from lookyloo.default import get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.helpers import (CaptureStatus, get_taxonomies,
get_user_agents, load_cookies, splash_status)
get_user_agents, load_cookies)
from lookyloo.lookyloo import Indexing, Lookyloo
from .genericapi import api as generic_api
@@ -600,10 +600,6 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
cache = lookyloo.capture_cache(tree_uuid)
if not cache:
status = lookyloo.get_capture_status(tree_uuid)
splash_up, splash_message = splash_status()
if not splash_up:
flash(f'The capture module is not reachable ({splash_message}).', 'error')
flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
if status == CaptureStatus.UNKNOWN:
error = lookyloo.try_error_status(tree_uuid)
if error:
@@ -810,10 +806,6 @@ def _prepare_capture_template(user_ua: Optional[str], predefined_url: Optional[s
if 'bot' not in ua['useragent'].lower():
default_ua = ua
break
splash_up, message = splash_status()
if not splash_up:
flash(f'The capture module is not reachable ({message}).', 'error')
flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
return render_template('capture.html', user_agents=user_agents, default=default_ua,
max_depth=max_depth, personal_ua=user_ua,
default_public=get_config('generic', 'default_public'),