2019-01-29 18:37:13 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2022-04-21 13:53:42 +02:00
|
|
|
import asyncio
|
2021-08-25 13:36:48 +02:00
|
|
|
import ipaddress
|
|
|
|
import json
|
2019-01-29 18:37:13 +01:00
|
|
|
import logging
|
2021-08-25 13:36:48 +02:00
|
|
|
import socket
|
|
|
|
from datetime import datetime
|
2021-09-07 12:59:31 +02:00
|
|
|
from io import BufferedIOBase
|
2021-08-25 13:36:48 +02:00
|
|
|
from pathlib import Path
|
2021-09-07 12:59:31 +02:00
|
|
|
from typing import Dict, List, Optional, Tuple, Union
|
2021-08-25 13:36:48 +02:00
|
|
|
from urllib.parse import urlsplit
|
|
|
|
|
|
|
|
from defang import refang # type: ignore
|
2022-05-23 00:15:52 +02:00
|
|
|
from redis.asyncio import Redis
|
2022-04-21 13:53:42 +02:00
|
|
|
from playwrightcapture import Capture
|
2019-01-29 18:37:13 +01:00
|
|
|
|
2021-10-18 13:06:43 +02:00
|
|
|
from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
|
2022-06-09 18:57:40 +02:00
|
|
|
from lookyloo.helpers import get_captures_dir, load_cookies, UserAgents
|
2019-01-29 18:37:13 +01:00
|
|
|
|
2022-05-02 13:04:55 +02:00
|
|
|
from lookyloo.modules import FOX
|
|
|
|
|
2019-01-29 18:37:13 +01:00
|
|
|
# Process-wide logging setup for the async capture daemon: timestamped
# records, INFO level by default (per-manager loglevel is handled separately
# by AbstractManager).
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO)
|
2019-01-29 18:37:13 +01:00
|
|
|
|
|
|
|
|
2020-11-05 14:14:33 +01:00
|
|
|
class AsyncCapture(AbstractManager):
    '''Daemon consuming capture requests from the redis queue ("to_capture")
    and capturing the requested URLs with Playwright, storing the results
    (HAR, screenshot, HTML, cookies, ...) on disk for later processing.'''

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.script_name = 'async_capture'
        # If enabled, refuse to capture hosts resolving to non-global IPs.
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.user_agents = UserAgents()

        # Third-party module: captured URLs can be auto-submitted to FOX.
        self.fox = FOX(get_config('modules', 'FOX'))
        if not self.fox.available:
            self.logger.warning('Unable to setup the FOX module')

    def thirdparty_submit(self, capture_data: Dict[str, str]) -> None:
        '''Forward the URL about to be captured to the enabled third-party modules.'''
        if self.fox.available:
            self.fox.capture_default_trigger(capture_data['url'], auto_trigger=True)

    async def process_capture_queue(self) -> None:
        '''Process a query from the capture queue.

        Pops the highest-priority UUID from the "to_capture" sorted set,
        launches the capture, and cleans up the related redis keys in a
        single pipeline once done.
        '''
        value: List[Tuple[str, float]] = await self.redis.zpopmax('to_capture')
        if not value or not value[0]:
            # The queue was consumed by an other process.
            return
        uuid, _score = value[0]
        queue: Optional[str] = await self.redis.get(f'{uuid}_mgmt')
        await self.redis.sadd('ongoing', uuid)

        async with self.redis.pipeline() as lazy_cleanup:
            await lazy_cleanup.delete(f'{uuid}_mgmt')
            if queue:
                # queue shouldn't be none, but if it is, just ignore.
                await lazy_cleanup.zincrby('queues', -1, queue)

            to_capture: Dict[str, str] = await self.redis.hgetall(uuid)

            if get_config('generic', 'default_public'):
                # By default, the captures are on the index, unless the user
                # marked them as un-listed.
                listing = not ('listing' in to_capture and to_capture['listing'].lower() in ('false', '0', ''))
            else:
                # By default, the captures are not on the index, unless the
                # user marked them as listed.
                listing = 'listing' in to_capture and to_capture['listing'].lower() in ('true', '1')

            # Turn the freetext for the headers into a dict.
            # Lines without a colon, or with an empty name/value part, are skipped.
            headers: Dict[str, str] = {}
            for header_line in to_capture.get('headers', '').splitlines():
                header, _, h_value = header_line.partition(':')
                if header and h_value:
                    headers[header.strip()] = h_value.strip()

            self.logger.info(f'Capturing {to_capture["url"]} - {uuid}')
            self.thirdparty_submit(to_capture)
            success, error_message = await self._capture(
                to_capture['url'],
                perma_uuid=uuid,
                cookies_pseudofile=to_capture.get('cookies', None),
                listing=listing,
                user_agent=to_capture.get('user_agent', None),
                referer=to_capture.get('referer', None),
                headers=headers if headers else None,
                proxy=to_capture.get('proxy', None),
                os=to_capture.get('os', None),
                browser=to_capture.get('browser', None),
                parent=to_capture.get('parent', None)
            )
            if success:
                self.logger.info(f'Successfully captured {to_capture["url"]} - {uuid}')
            else:
                self.logger.warning(f'Unable to capture {to_capture["url"]} - {uuid}: {error_message}')
                await lazy_cleanup.setex(f'error_{uuid}', 36000, f'{error_message} - {to_capture["url"]} - {uuid}')
            await lazy_cleanup.srem('ongoing', uuid)
            await lazy_cleanup.delete(uuid)
            # make sure to expire the key if nothing was processed for a while (= queues empty)
            await lazy_cleanup.expire('queues', 600)
            await lazy_cleanup.execute()

    async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                       listing: bool=True, user_agent: Optional[str]=None,
                       referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None,
                       proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
                       browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
        '''Launch a capture.

        :param url: URL to capture (defanged input is refanged; a scheme is
            prepended if missing).
        :param perma_uuid: permanent UUID of the capture, written next to the
            results on disk.
        :param cookies_pseudofile: cookies to preload in the browser context.
        :param listing: whether the capture is visible on the public index.
        :param user_agent: user agent to use; falls back to the default UA.
        :param referer: referer to send with the initial request.
        :param headers: extra HTTP headers to send.
        :param proxy: proxy to use; automatically set to the tor proxy for
            .onion hosts when not given.
        :param os: client OS picked by the submitter (stored as metadata only;
            shadows the builtin ``os`` module, but that module isn't used here).
        :param browser: client browser picked by the submitter (metadata only).
        :param parent: UUID of the parent capture, if any.
        :return: (success, message) tuple.
        '''
        url = url.strip()
        url = refang(url)
        if not url.startswith('http'):
            url = f'http://{url}'
        splitted_url = urlsplit(url)
        if self.only_global_lookups:
            if splitted_url.netloc:
                if splitted_url.hostname and splitted_url.hostname.split('.')[-1] != 'onion':
                    try:
                        ip = socket.gethostbyname(splitted_url.hostname)
                    except socket.gaierror:
                        self.logger.info('Name or service not known')
                        return False, 'Name or service not known.'
                    if not ipaddress.ip_address(ip).is_global:
                        # NOTE: typo "ressources" fixed in the message below.
                        return False, 'Capturing resources on private IPs is disabled.'
            else:
                return False, 'Unable to find hostname or IP in the query.'

        # check if onion
        if (not proxy and splitted_url.netloc and splitted_url.hostname
                and splitted_url.hostname.split('.')[-1] == 'onion'):
            proxy = get_config('generic', 'tor_proxy')

        cookies = load_cookies(cookies_pseudofile)
        if not user_agent:
            # Catch case where the UA is broken on the UI, and the async submission.
            self.user_agents.user_agents  # triggers an update if needed
            ua: str = self.user_agents.default['useragent']
        else:
            ua = user_agent

        self.logger.info(f'Capturing {url}')
        try:
            async with Capture(proxy=proxy) as capture:
                capture.prepare_cookies(cookies)
                capture.user_agent = ua
                if headers:
                    capture.http_headers = headers
                await capture.prepare_context()
                entries = await capture.capture_page(url, referer=referer)
        except Exception as e:
            self.logger.exception(f'Something went terribly wrong when capturing {url} - {e}')
            return False, f'Something went terribly wrong when capturing {url}.'

        if not entries:
            # broken
            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
            return False, f'Something went terribly wrong when capturing {url}.'
        now = datetime.now()
        # Captures are stored as <captures>/<year>/<month>/<iso timestamp>/
        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
        safe_create_dir(dirpath)

        # Write the submitter-provided metadata (optional)
        if os or browser:
            meta = {}
            if os:
                meta['os'] = os
            if browser:
                meta['browser'] = browser
            with (dirpath / 'meta').open('w') as _meta:
                json.dump(meta, _meta)

        # Write UUID
        with (dirpath / 'uuid').open('w') as _uuid:
            _uuid.write(perma_uuid)

        # Write no_index marker (optional)
        if not listing:
            (dirpath / 'no_index').touch()

        # Write parent UUID (optional)
        if parent:
            with (dirpath / 'parent').open('w') as _parent:
                _parent.write(parent)

        if 'error' in entries:
            with (dirpath / 'error.txt').open('w') as _error:
                json.dump(entries['error'], _error)

        if 'har' not in entries:
            # No HAR means the capture failed. Use .get(): 'error' may be
            # absent as well (the original indexed it and could KeyError here).
            return False, entries.get('error') or 'Unknown error'

        with (dirpath / '0.har').open('w') as _har:
            json.dump(entries['har'], _har)

        if entries.get('png'):
            with (dirpath / '0.png').open('wb') as _img:
                _img.write(entries['png'])

        if entries.get('html'):
            with (dirpath / '0.html').open('w') as _html:
                _html.write(entries['html'])

        if entries.get('last_redirected_url'):
            with (dirpath / '0.last_redirect.txt').open('w') as _redir:
                _redir.write(entries['last_redirected_url'])

        if entries.get('cookies'):
            with (dirpath / '0.cookies.json').open('w') as _cookies:
                json.dump(entries['cookies'], _cookies)
        # Make the capture discoverable by the rest of the platform.
        await self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
        return True, 'All good!'

    async def _to_run_forever_async(self):
        '''Drain the capture queue, then return (AbstractManager re-invokes us).'''
        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        while await self.redis.exists('to_capture'):
            await self.process_capture_queue()
            if self.shutdown_requested():
                break
        await self.redis.close()
|
2019-01-29 18:37:13 +01:00
|
|
|
|
|
|
|
|
2020-10-03 21:19:43 +02:00
|
|
|
def main():
    """Instantiate the async capture manager and run its main loop forever."""
    manager = AsyncCapture()
    asyncio.run(manager.run_async(sleep_in_sec=1))
|
2020-10-03 21:19:43 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the capture daemon when invoked directly.
if __name__ == '__main__':
    main()
|