#!/usr/bin/env python3
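'''Async capture worker: pops pending capture requests from the Redis 'to_capture'
queue, runs them with PlaywrightCapture, and writes the results to the captures
directory on disk.
'''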

import asyncio
import ipaddress
import json
import logging
import os
import socket

from datetime import datetime
from io import BufferedIOBase
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import urlsplit

from defang import refang  # type: ignore
from redis.asyncio import Redis
from playwrightcapture import Capture, PlaywrightCaptureException

from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
from lookyloo.helpers import get_captures_dir, load_cookies, UserAgents, ParsedUserAgent

from lookyloo.modules import FOX

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO)


class AsyncCapture(AbstractManager):

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.script_name = 'async_capture'
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.user_agents = UserAgents()

        self.fox = FOX(get_config('modules', 'FOX'))
        if not self.fox.available:
            self.logger.warning('Unable to setup the FOX module')

    def thirdparty_submit(self, url: str) -> None:
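        '''Forward the URL to the third-party FOX module when it is available.'''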
        if self.fox.available:
            self.fox.capture_default_trigger(url, auto_trigger=True)

    async def process_capture_queue(self) -> None:
        '''Process a query from the capture queue'''
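        # Redis layout, as used in this method: 'to_capture' is a sorted set of capture
        # UUIDs (zpopmax pops the highest-scored entry first), '<uuid>' is a hash with the
        # capture settings, '<uuid>_mgmt' names the submission queue used for accounting
        # in the 'queues' sorted set, and 'ongoing' tracks the captures in progress.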
        value: List[Tuple[bytes, float]] = await self.redis.zpopmax('to_capture')
        if not value or not value[0]:
            # The queue was consumed by another process.
            return
        uuid: str = value[0][0].decode()
        queue: Optional[bytes] = await self.redis.getdel(f'{uuid}_mgmt')
        await self.redis.sadd('ongoing', uuid)

        to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid)

        if get_config('generic', 'default_public'):
            # By default, the captures are on the index, unless the user marks them as unlisted
            listing = False if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
        else:
            # By default, the captures are not on the index, unless the user marks them as listed
            listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False

        # Turn the freetext for the headers into a dict
        headers: Dict[str, str] = {}
        if b'headers' in to_capture:
            for header_line in to_capture[b'headers'].decode().splitlines():
                if header_line and ':' in header_line:
                    splitted = header_line.split(':', 1)
                    if splitted and len(splitted) == 2:
                        header, h_value = splitted
                        if header and h_value:
                            headers[header.strip()] = h_value.strip()
        if to_capture.get(b'dnt'):
            headers['DNT'] = to_capture[b'dnt'].decode()

        if to_capture.get(b'document'):
            # we do not have a URL yet.
            document_name = Path(to_capture[b'document_name'].decode()).name
            tmp_f = NamedTemporaryFile(suffix=document_name, delete=False)
            with open(tmp_f.name, "wb") as f:
                f.write(to_capture[b'document'])
            url = f'file://{tmp_f.name}'
        elif to_capture.get(b'url'):
            url = to_capture[b'url'].decode()
            self.thirdparty_submit(url)
        else:
            # Neither a document nor a URL was submitted: nothing to capture.
            url = ''
            self.logger.warning(f'Invalid capture {to_capture}.')

        if url:
            self.logger.info(f'Capturing {url} - {uuid}')
            success, error_message = await self._capture(
                url,
                perma_uuid=uuid,
                cookies_pseudofile=to_capture.get(b'cookies', None),
                listing=listing,
                user_agent=to_capture[b'user_agent'].decode() if to_capture.get(b'user_agent') else None,
                referer=to_capture[b'referer'].decode() if to_capture.get(b'referer') else None,
                headers=headers if headers else None,
                proxy=to_capture[b'proxy'].decode() if to_capture.get(b'proxy') else None,
                os=to_capture[b'os'].decode() if to_capture.get(b'os') else None,
                browser=to_capture[b'browser'].decode() if to_capture.get(b'browser') else None,
                browser_engine=to_capture[b'browser_engine'].decode() if to_capture.get(b'browser_engine') else None,
                device_name=to_capture[b'device_name'].decode() if to_capture.get(b'device_name') else None,
                parent=to_capture[b'parent'].decode() if to_capture.get(b'parent') else None
            )

            if to_capture.get(b'document'):
                os.unlink(tmp_f.name)

            if success:
                self.logger.info(f'Successfully captured {url} - {uuid}')
            else:
                self.logger.warning(f'Unable to capture {url} - {uuid}: {error_message}')
                await self.redis.setex(f'error_{uuid}', 36000, f'{error_message} - {url} - {uuid}')

        async with self.redis.pipeline() as lazy_cleanup:
            if queue and await self.redis.zscore('queues', queue):
                await lazy_cleanup.zincrby('queues', -1, queue)
            await lazy_cleanup.srem('ongoing', uuid)
            await lazy_cleanup.delete(uuid)
            # make sure to expire the key if nothing was processed for a while (= queues empty)
            await lazy_cleanup.expire('queues', 600)
            await lazy_cleanup.execute()

    async def _capture(self, url: str, *, perma_uuid: str,
                       cookies_pseudofile: Optional[Union[BufferedIOBase, str, bytes]]=None,
                       listing: bool=True, user_agent: Optional[str]=None,
                       referer: Optional[str]=None,
                       headers: Optional[Dict[str, str]]=None,
                       proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
                       browser: Optional[str]=None, parent: Optional[str]=None,
                       browser_engine: Optional[str]=None,
                       device_name: Optional[str]=None,
                       viewport: Optional[Dict[str, int]]=None) -> Tuple[bool, str]:
        '''Launch a capture'''
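        # Returns a (success, message) tuple: on success, the capture artefacts are
        # written under self.capture_dir and registered in the 'lookup_dirs' Redis hash.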
        url = url.strip()
        url = refang(url)
        if not url.startswith('data') and not url.startswith('http') and not url.startswith('file'):
            url = f'http://{url}'
        splitted_url = urlsplit(url)
        if self.only_global_lookups:
            if url.startswith('data') or url.startswith('file'):
                pass
            elif splitted_url.netloc:
                if splitted_url.hostname and splitted_url.hostname.split('.')[-1] != 'onion':
                    try:
                        ip = socket.gethostbyname(splitted_url.hostname)
                    except socket.gaierror:
                        self.logger.info('Name or service not known')
                        return False, 'Name or service not known.'
                    if not ipaddress.ip_address(ip).is_global:
                        return False, 'Capturing resources on private IPs is disabled.'
            else:
                return False, 'Unable to find hostname or IP in the query.'

        # check if onion
        if (not proxy and splitted_url.netloc and splitted_url.hostname
                and splitted_url.hostname.split('.')[-1] == 'onion'):
            proxy = get_config('generic', 'tor_proxy')

        if not user_agent:
            # Catch case where the UA is broken on the UI, and the async submission.
            self.user_agents.user_agents  # triggers an update of the default UAs

        capture_ua = user_agent if user_agent else self.user_agents.default['useragent']
        if not browser_engine:
            # Automatically pick a browser
            parsed_ua = ParsedUserAgent(capture_ua)
            if not parsed_ua.browser:
                browser_engine = 'webkit'
            elif parsed_ua.browser.lower().startswith('chrom'):
                browser_engine = 'chromium'
            elif parsed_ua.browser.lower().startswith('firefox'):
                browser_engine = 'firefox'
            else:
                browser_engine = 'webkit'

        self.logger.info(f'Capturing {url}')
        try:
            async with Capture(browser=browser_engine, device_name=device_name, proxy=proxy) as capture:
                if headers:
                    capture.headers = headers
                if cookies_pseudofile:
                    # required by Mypy: https://github.com/python/mypy/issues/3004
                    capture.cookies = load_cookies(cookies_pseudofile)  # type: ignore
                if viewport:
                    # required by Mypy: https://github.com/python/mypy/issues/3004
                    capture.viewport = viewport  # type: ignore
                if not device_name:
                    capture.user_agent = capture_ua
                await capture.initialize_context()
                entries = await capture.capture_page(url, referer=referer)
        except PlaywrightCaptureException as e:
            self.logger.exception(f'Invalid parameters for the capture of {url} - {e}')
            return False, f'Invalid parameters for the capture of {url} - {e}'
        except Exception as e:
            self.logger.exception(f'Something went terribly wrong when capturing {url} - {e}')
            return False, f'Something went terribly wrong when capturing {url}.'

        if not entries:
            # broken
            self.logger.critical(f'Something went terribly wrong when capturing {url}.')
            return False, f'Something went terribly wrong when capturing {url}.'
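
        # The capture artefacts are written to a timestamped directory under the captures
        # dir (<capture_dir>/<year>/<month>/<iso timestamp>); the rest of Lookyloo finds it
        # via the 'lookup_dirs' hash set at the end of this method.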
        now = datetime.now()
        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
        safe_create_dir(dirpath)

        if os or browser:
            meta = {}
            if os:
                meta['os'] = os
            if browser:
                meta['browser'] = browser
            with (dirpath / 'meta').open('w') as _meta:
                json.dump(meta, _meta)

        # Write UUID
        with (dirpath / 'uuid').open('w') as _uuid:
            _uuid.write(perma_uuid)

        # Write no_index marker (optional)
        if not listing:
            (dirpath / 'no_index').touch()

        # Write parent UUID (optional)
        if parent:
            with (dirpath / 'parent').open('w') as _parent:
                _parent.write(parent)

        if 'downloaded_filename' in entries and entries['downloaded_filename']:
            with (dirpath / '0.data.filename').open('w') as _downloaded_filename:
                _downloaded_filename.write(entries['downloaded_filename'])

        if 'downloaded_file' in entries and entries['downloaded_file']:
            with (dirpath / '0.data').open('wb') as _downloaded_file:
                _downloaded_file.write(entries['downloaded_file'])

        if 'error' in entries:
            with (dirpath / 'error.txt').open('w') as _error:
                json.dump(entries['error'], _error)

        if 'har' not in entries:
            return False, entries['error'] if entries.get('error') else "Unknown error"

        with (dirpath / '0.har').open('w') as _har:
            json.dump(entries['har'], _har)

        if 'png' in entries and entries['png']:
            with (dirpath / '0.png').open('wb') as _img:
                _img.write(entries['png'])

        if 'html' in entries and entries['html']:
            with (dirpath / '0.html').open('w') as _html:
                _html.write(entries['html'])

        if 'last_redirected_url' in entries and entries['last_redirected_url']:
            with (dirpath / '0.last_redirect.txt').open('w') as _redir:
                _redir.write(entries['last_redirected_url'])

        if 'cookies' in entries and entries['cookies']:
            with (dirpath / '0.cookies.json').open('w') as _cookies:
                json.dump(entries['cookies'], _cookies)

        await self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
        return True, 'All good!'

    async def _to_run_forever_async(self):
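        '''Drain the capture queue: process entries until it is empty or a shutdown is requested.'''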
        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'))
        while await self.redis.exists('to_capture'):
            await self.process_capture_queue()
            if self.shutdown_requested():
                break
        await self.redis.close()


def main():
    m = AsyncCapture()
    asyncio.run(m.run_async(sleep_in_sec=1))


if __name__ == '__main__':
    main()