lookyloo/bin/async_capture.py

#!/usr/bin/env python3

import asyncio
import ipaddress
import json
import logging
import socket
from datetime import datetime
from io import BufferedIOBase
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import urlsplit

from defang import refang  # type: ignore
from redis import Redis
from playwrightcapture import Capture

from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
from lookyloo.helpers import get_captures_dir, load_cookies

# Logging setup for this script: each record is formatted as
# "<timestamp> <logger name> <level>:<message>", and the level is set to INFO.
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO)


class AsyncCapture(AbstractManager):
    '''Worker popping capture requests from the `to_capture` sorted set in redis,
    running them with playwrightcapture, and writing the results in the captures directory.'''

    def __init__(self, loglevel: int=logging.INFO):
        '''Load the settings used by the captures and connect to the cache redis socket.

        :param loglevel: Log level, passed as-is to the parent class.
        '''
        super().__init__(loglevel)
        self.script_name = 'async_capture'
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    @staticmethod
    def _is_listed(raw_listing: Optional[str]) -> bool:
        '''Tell if a capture goes on the index, given the (optional) `listing` field of the request.'''
        if get_config('generic', 'default_public'):
            # By default, the captures are on the index, unless the user mark them as un-listed
            return not (raw_listing is not None and raw_listing.lower() in ('false', '0', ''))
        # By default, the captures are not on the index, unless the user mark them as listed
        return raw_listing is not None and raw_listing.lower() in ('true', '1')

    @staticmethod
    def _parse_headers(raw_headers: Optional[str]) -> Dict[str, str]:
        '''Turn the freetext for the headers (one `Name: value` per line) into a dict.

        Lines without a colon, or with an empty name or value once stripped, are ignored.
        '''
        headers: Dict[str, str] = {}
        if not raw_headers:
            return headers
        for header_line in raw_headers.splitlines():
            name, separator, h_value = header_line.partition(':')
            # Strip *before* testing, so whitespace-only names/values are dropped too.
            name, h_value = name.strip(), h_value.strip()
            if separator and name and h_value:
                headers[name] = h_value
        return headers

    async def process_capture_queue(self) -> None:
        '''Process a query from the capture queue.

        Pops the highest scored UUID from `to_capture`, flags it as ongoing, runs the capture
        and finally cleans up the keys related to the request. On failure, the message is
        stored in `error_<uuid>` for 36000 seconds. The cleanup is executed even if the
        request has no URL or if the capture raises; in the latter case the exception is
        re-raised once the cleanup is done.
        '''
        value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture')  # type: ignore
        if not value or not value[0]:
            # The queue was consumed by another process.
            return
        uuid, _score = value[0]
        queue: Optional[str] = self.redis.get(f'{uuid}_mgmt')
        self.redis.sadd('ongoing', uuid)

        # The bookkeeping commands are only queued here, and sent together at the very end.
        lazy_cleanup = self.redis.pipeline()
        lazy_cleanup.delete(f'{uuid}_mgmt')
        if queue:
            # queue shouldn't be none, but if it is, just ignore.
            lazy_cleanup.zincrby('queues', -1, queue)

        to_capture: Dict[str, str] = self.redis.hgetall(uuid)
        url = to_capture.get('url')

        try:
            if not url:
                # The hash with the capture settings is missing or has no URL: nothing to capture.
                self.logger.warning(f'Unable to capture {uuid}: no URL in the capture settings.')
                lazy_cleanup.setex(f'error_{uuid}', 36000, f'No URL in the capture settings - {uuid}')
                return

            headers = self._parse_headers(to_capture.get('headers'))

            self.logger.info(f'Capturing {url} - {uuid}')
            success, error_message = await self._capture(
                url,
                perma_uuid=uuid,
                cookies_pseudofile=to_capture.get('cookies', None),
                listing=self._is_listed(to_capture.get('listing')),
                user_agent=to_capture.get('user_agent', None),
                referer=to_capture.get('referer', None),
                headers=headers or None,
                proxy=to_capture.get('proxy', None),
                os=to_capture.get('os', None),
                browser=to_capture.get('browser', None),
                parent=to_capture.get('parent', None)
            )
            if success:
                self.logger.info(f'Successfully captured {url} - {uuid}')
            else:
                self.logger.warning(f'Unable to capture {url} - {uuid}: {error_message}')
                lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')
        except Exception:
            # Unexpected failure: keep a trace for whoever polls the status, then let it bubble up.
            lazy_cleanup.setex(f'error_{uuid}', 36000,
                               f'Something went terribly wrong when capturing {url}. - {url} - {uuid}')
            raise
        finally:
            lazy_cleanup.srem('ongoing', uuid)
            lazy_cleanup.delete(uuid)
            # make sure to expire the key if nothing was processed for a while (= queues empty)
            lazy_cleanup.expire('queues', 600)
            lazy_cleanup.execute()

    async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                       listing: bool=True, user_agent: Optional[str]=None,
                       referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None,
                       proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
                       browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
        '''Launch a capture and store the result in a new directory of the captures directory.

        :param url: URL to capture. It is stripped, refanged, and prefixed with `http://`
                    if it doesn't start with `http://` or `https://`.
        :param perma_uuid: UUID of the capture, written in the `uuid` file and used as key in `lookup_dirs`.
        :param cookies_pseudofile: Cookies, passed to `load_cookies`.
        :param listing: If False, a `no_index` marker is created in the capture directory.
        :param user_agent: User agent to use, defaults to `default_user_agent` from the config.
        :param referer: Referer passed to the capture.
        :param headers: Extra HTTP headers for the capture.
        :param proxy: Proxy to use. If unset and the hostname ends with `.onion`, `tor_proxy` from the config is used.
        :param os: Stored in the `meta` file, if set.
        :param browser: Stored in the `meta` file, if set.
        :param parent: UUID of the parent capture, written in the `parent` file, if set.
        :return: `(True, 'All good!')` on success, `(False, <error message>)` otherwise.
        '''
        url = refang(url.strip())
        # Look for an actual scheme: a bare `startswith('http')` also matches hosts like httpbin.org
        if not url.lower().startswith(('http://', 'https://')):
            url = f'http://{url}'
        try:
            splitted_url = urlsplit(url)
            hostname = splitted_url.hostname
        except ValueError:
            # urlsplit rejects some malformed netlocs (i.e. unbalanced brackets)
            self.logger.info(f'Unable to parse {url}')
            return False, 'Unable to find hostname or IP in the query.'

        if self.only_global_lookups:
            if not splitted_url.netloc:
                return False, 'Unable to find hostname or IP in the query.'
            if hostname and hostname.split('.')[-1] != 'onion':
                # NOTE: gethostbyname is a blocking lookup and only returns one IPv4 address.
                try:
                    ip = socket.gethostbyname(hostname)
                except (socket.gaierror, UnicodeError):
                    # UnicodeError: the hostname cannot be encoded (invalid IDNA label)
                    self.logger.info('Name or service not known')
                    return False, 'Name or service not known.'
                if not ipaddress.ip_address(ip).is_global:
                    return False, 'Capturing ressources on private IPs is disabled.'

        # check if onion
        if (not proxy and splitted_url.netloc and hostname
                and hostname.split('.')[-1] == 'onion'):
            proxy = get_config('generic', 'tor_proxy')

        cookies = load_cookies(cookies_pseudofile)
        # Catch case where the UA is broken on the UI, and the async submission.
        ua: str = user_agent if user_agent else get_config('generic', 'default_user_agent')

        self.logger.info(f'Capturing {url}')
        try:
            capture = Capture()
            if proxy:
                await capture.prepare_capture(proxy=proxy)
            else:
                await capture.prepare_capture()
            capture.prepare_cookies(cookies)
            capture.user_agent = ua
            if headers:
                capture.http_headers = headers
            await capture.prepare_context()
            entries = await capture.capture_page(url, referer=referer)
        except Exception as e:
            self.logger.exception(f'Something went terribly wrong when capturing {url} - {e}')
            return False, f'Something went terribly wrong when capturing {url}.'

        if not entries:
            # broken
            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
            return False, f'Something went terribly wrong when capturing {url}.'

        # One directory per capture: <captures dir>/<year>/<month>/<ISO timestamp>
        now = datetime.now()
        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
        safe_create_dir(dirpath)

        # Write OS / browser (optional)
        meta = {key: val for key, val in (('os', os), ('browser', browser)) if val}
        if meta:
            with (dirpath / 'meta').open('w') as _meta:
                json.dump(meta, _meta)

        # Write UUID
        with (dirpath / 'uuid').open('w') as _uuid:
            _uuid.write(perma_uuid)

        # Write no_index marker (optional)
        if not listing:
            (dirpath / 'no_index').touch()

        # Write parent UUID (optional)
        if parent:
            with (dirpath / 'parent').open('w') as _parent:
                _parent.write(parent)

        if 'error' in entries:
            # The error is stored on disk, but the directory isn't added to lookup_dirs.
            with (dirpath / 'error.txt').open('w') as _error:
                json.dump(entries['error'], _error)
            return False, entries['error'] or "Unknown error"

        # The capture went fine
        with (dirpath / '0.har').open('w') as _har:
            json.dump(entries['har'], _har)
        with (dirpath / '0.png').open('wb') as _img:
            _img.write(entries['png'])
        with (dirpath / '0.html').open('w') as _html:
            _html.write(entries['html'])
        with (dirpath / '0.last_redirect.txt').open('w') as _redir:
            _redir.write(entries['last_redirected_url'])

        if 'cookies' in entries:
            with (dirpath / '0.cookies.json').open('w') as _cookies:
                json.dump(entries['cookies'], _cookies)
        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
        return True, 'All good!'

    async def _to_run_forever_async(self) -> None:
        '''Process the capture requests one by one, as long as the `to_capture` key exists
        in redis and no shutdown was requested.'''
        while self.redis.exists('to_capture'):
            await self.process_capture_queue()
            if self.shutdown_requested():
                break


def main():
    '''Entry point of the script: create the capture manager and drive its
    `run_async` coroutine (not defined in this file, presumably inherited from
    AbstractManager) with a one second sleep setting, until it returns.'''
    manager = AsyncCapture()
    runner = manager.run_async(sleep_in_sec=1)
    asyncio.run(runner)


# Only start the worker when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00			`#!/usr/bin/env python3`

new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`import asyncio`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`import ipaddress`
			`import json`
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00			`import logging`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`import socket`
			`from datetime import datetime`
chg: reorder imports 2021-09-07 12:59:31 +02:00			`from io import BufferedIOBase`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`from pathlib import Path`
chg: reorder imports 2021-09-07 12:59:31 +02:00			`from typing import Dict, List, Optional, Tuple, Union`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`from urllib.parse import urlsplit`

			`from defang import refang # type: ignore`
			`from redis import Redis`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`from playwrightcapture import Capture`
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00
chg: use template 2021-10-18 13:06:43 +02:00			`from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`from lookyloo.helpers import get_captures_dir, load_cookies`
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00
			`logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',`
fix: remove datefmt from logging.basicConfig, it was a bad idea. 2021-09-01 10:40:59 +02:00			`level=logging.INFO)`
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00

fix: Rename scrape -> capture in async 2020-11-05 14:14:33 +01:00			`class AsyncCapture(AbstractManager):`
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00
chg: Remove useless code 2021-03-12 16:49:04 +01:00			`def __init__(self, loglevel: int=logging.INFO):`
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00			`super().__init__(loglevel)`
fix: Move set/unset running to abstract Avoid issues when a script fails unexpectedly. 2021-04-09 14:33:40 +02:00			`self.script_name = 'async_capture'`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')`
			`self.capture_dir: Path = get_captures_dir()`
			`self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)`

new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`async def process_capture_queue(self) -> None:`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`'''Process a query from the capture queue'''`
			`value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture') # type: ignore`
			`if not value or not value[0]:`
chg: More cleanup, support clean shutdown of multiple async captures 2021-08-25 16:40:51 +02:00			`# The queue was consumed by an other process.`
			`return`
			`uuid, _score = value[0]`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`queue: Optional[str] = self.redis.get(f'{uuid}_mgmt')`
			`self.redis.sadd('ongoing', uuid)`

			`lazy_cleanup = self.redis.pipeline()`
			`lazy_cleanup.delete(f'{uuid}_mgmt')`
			`if queue:`
			`# queue shouldn't be none, but if it is, just ignore.`
			`lazy_cleanup.zincrby('queues', -1, queue)`

			`to_capture: Dict[str, str] = self.redis.hgetall(uuid)`

fix: Use default_public for all capture, including if submitted via the API 2021-11-02 22:58:31 +01:00			`if get_config('generic', 'default_public'):`
			`# By default, the captures are on the index, unless the user mark them as un-listed`
			`listing = False if ('listing' in to_capture and to_capture['listing'].lower() in ['false', '0', '']) else True`
			`else:`
			`# By default, the captures are not on the index, unless the user mark them as listed`
			`listing = True if ('listing' in to_capture and to_capture['listing'].lower() in ['true', '1']) else False`

new: Pass optional arbitrary HTTP headers to capture 2021-11-23 21:59:56 +01:00			`# Turn the freetext for the headers into a dict`
			`headers = {}`
			`if 'headers' in to_capture:`
			`for header_line in to_capture['headers'].splitlines():`
			`if header_line and ':' in header_line:`
			`splitted = header_line.split(':', 1)`
			`if splitted and len(splitted) == 2:`
			`header, h_value = splitted`
			`if header and h_value:`
			`headers[header.strip()] = h_value.strip()`

chg: Improve logging. 2021-09-01 14:08:25 +02:00			`self.logger.info(f'Capturing {to_capture["url"]} - {uuid}')`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`success, error_message = await self._capture(`
chg: Improve error message if the capture fails Fix #257 2021-09-07 18:15:56 +02:00			`to_capture['url'],`
			`perma_uuid=uuid,`
			`cookies_pseudofile=to_capture.get('cookies', None),`
fix: Use default_public for all capture, including if submitted via the API 2021-11-02 22:58:31 +01:00			`listing=listing,`
chg: Improve error message if the capture fails Fix #257 2021-09-07 18:15:56 +02:00			`user_agent=to_capture.get('user_agent', None),`
chg: Cleanup passing listing key to and from bool in redis 2021-09-10 14:20:58 +02:00			`referer=to_capture.get('referer', None),`
new: Pass optional arbitrary HTTP headers to capture 2021-11-23 21:59:56 +01:00			`headers=headers if headers else None,`
chg: Cleanup passing listing key to and from bool in redis 2021-09-10 14:20:58 +02:00			`proxy=to_capture.get('proxy', None),`
chg: Improve error message if the capture fails Fix #257 2021-09-07 18:15:56 +02:00			`os=to_capture.get('os', None),`
			`browser=to_capture.get('browser', None),`
			`parent=to_capture.get('parent', None)`
			`)`
			`if success:`
chg: Improve logging. 2021-09-01 14:08:25 +02:00			`self.logger.info(f'Successfully captured {to_capture["url"]} - {uuid}')`
chg: More cleanup, support clean shutdown of multiple async captures 2021-08-25 16:40:51 +02:00			`else:`
chg: Improve error message if the capture fails Fix #257 2021-09-07 18:15:56 +02:00			`self.logger.warning(f'Unable to capture {to_capture["url"]} - {uuid}: {error_message}')`
			`lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {to_capture["url"]} - {uuid}')`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`lazy_cleanup.srem('ongoing', uuid)`
			`lazy_cleanup.delete(uuid)`
chg: More cleanup, support clean shutdown of multiple async captures 2021-08-25 16:40:51 +02:00			`# make sure to expire the key if nothing was processed for a while (= queues empty)`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`lazy_cleanup.expire('queues', 600)`
			`lazy_cleanup.execute()`

new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,`
			`listing: bool=True, user_agent: Optional[str]=None,`
			`referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None,`
			`proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,`
			`browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`'''Launch a capture'''`
			`url = url.strip()`
			`url = refang(url)`
			`if not url.startswith('http'):`
			`url = f'http://{url}'`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`splitted_url = urlsplit(url)`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`if self.only_global_lookups:`
			`if splitted_url.netloc:`
			`if splitted_url.hostname:`
			`if splitted_url.hostname.split('.')[-1] != 'onion':`
			`try:`
			`ip = socket.gethostbyname(splitted_url.hostname)`
			`except socket.gaierror:`
			`self.logger.info('Name or service not known')`
chg: Improve error message if the capture fails Fix #257 2021-09-07 18:15:56 +02:00			`return False, 'Name or service not known.'`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`if not ipaddress.ip_address(ip).is_global:`
chg: Improve error message if the capture fails Fix #257 2021-09-07 18:15:56 +02:00			`return False, 'Capturing ressources on private IPs is disabled.'`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`else:`
chg: Improve error message if the capture fails Fix #257 2021-09-07 18:15:56 +02:00			`return False, 'Unable to find hostname or IP in the query.'`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`# check if onion`
			`if (not proxy and splitted_url.netloc and splitted_url.hostname`
			`and splitted_url.hostname.split('.')[-1] == 'onion'):`
			`proxy = get_config('generic', 'tor_proxy')`

chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`cookies = load_cookies(cookies_pseudofile)`
			`if not user_agent:`
			`# Catch case where the UA is broken on the UI, and the async submission.`
			`ua: str = get_config('generic', 'default_user_agent')`
			`else:`
			`ua = user_agent`

			`self.logger.info(f'Capturing {url}')`
			`try:`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`capture = Capture()`
			`if proxy:`
			`await capture.prepare_capture(proxy=proxy)`
			`else:`
			`await capture.prepare_capture()`
chg: properly set cookies 2022-04-24 19:17:54 +02:00			`capture.prepare_cookies(cookies)`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`capture.user_agent = ua`
			`if headers:`
			`capture.http_headers = headers`
			`await capture.prepare_context()`
			`entries = await capture.capture_page(url, referer=referer)`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`except Exception as e:`
fix: issue in playwrightcapture module 2022-04-25 15:20:05 +02:00			`self.logger.exception(f'Something went terribly wrong when capturing {url} - {e}')`
fix: better handling if capture fails. 2022-04-21 14:48:28 +02:00			`return False, f'Something went terribly wrong when capturing {url}.'`

new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`if not entries:`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`# broken`
			`self.logger.critical(f'Something went terribly wrong when capturing {url}.')`
new: Pass optional arbitrary HTTP headers to capture 2021-11-23 21:59:56 +01:00			`return False, f'Something went terribly wrong when capturing {url}.'`
chg: Improve storage, support both modes. 2021-08-26 15:49:19 +02:00			`now = datetime.now()`
			`dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`safe_create_dir(dirpath)`

			`if os or browser:`
			`meta = {}`
			`if os:`
			`meta['os'] = os`
			`if browser:`
			`meta['browser'] = browser`
			`with (dirpath / 'meta').open('w') as _meta:`
			`json.dump(meta, _meta)`

			`# Write UUID`
			`with (dirpath / 'uuid').open('w') as _uuid:`
			`_uuid.write(perma_uuid)`

			`# Write no_index marker (optional)`
			`if not listing:`
			`(dirpath / 'no_index').touch()`

			`# Write parent UUID (optional)`
			`if parent:`
			`with (dirpath / 'parent').open('w') as _parent:`
			`_parent.write(parent)`

new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`if 'error' in entries:`
			`with (dirpath / 'error.txt').open('w') as _error:`
			`json.dump(entries['error'], _error)`
fix: Mypy, docker 2022-04-26 00:59:57 +02:00			`return False, entries['error'] if entries['error'] else "Unknown error"`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00
			`# The capture went fine`
			`harfile = entries['har']`
			`png = entries['png']`
			`html = entries['html']`
			`last_redirect = entries['last_redirected_url']`

			`with (dirpath / '0.har').open('w') as _har:`
			`json.dump(harfile, _har)`
			`with (dirpath / '0.png').open('wb') as _img:`
			`_img.write(png)`
			`with (dirpath / '0.html').open('w') as _html:`
			`_html.write(html)`
			`with (dirpath / '0.last_redirect.txt').open('w') as _redir:`
			`_redir.write(last_redirect)`

			`if 'cookies' in entries:`
			`with (dirpath / '0.cookies.json').open('w') as _cookies:`
chg: Use packaged playwright capture module 2022-04-25 13:34:01 +02:00			`json.dump(entries['cookies'], _cookies)`
chg: Major refactoring, move capture code to external script. 2021-08-25 13:36:48 +02:00			`self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))`
chg: Improve error message if the capture fails Fix #257 2021-09-07 18:15:56 +02:00			`return True, 'All good!'`
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`async def _to_run_forever_async(self):`
chg: More cleanup, support clean shutdown of multiple async captures 2021-08-25 16:40:51 +02:00			`while self.redis.exists('to_capture'):`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`await self.process_capture_queue()`
chg: Move the process managment methods to the proper class 2021-08-27 17:28:26 +02:00			`if self.shutdown_requested():`
chg: Improve async processing 2019-04-05 16:12:54 +02:00			`break`
new: Initial commit for client and async scraping 2019-01-29 18:37:13 +01:00

chg: Bump minimal version of poetry, bump deps, fix pyproject 2020-10-03 21:19:43 +02:00			`def main():`
fix: Rename scrape -> capture in async 2020-11-05 14:14:33 +01:00			`m = AsyncCapture()`
new: Switch away from splash to use playwright 2022-04-21 13:53:42 +02:00			`asyncio.run(m.run_async(sleep_in_sec=1))`
chg: Bump minimal version of poetry, bump deps, fix pyproject 2020-10-03 21:19:43 +02:00

			`if __name__ == '__main__':`
			`main()`