new: Switch away from splash to use playwright

Raphaël Vinot 2022-04-21 14:53:42 +03:00
parent 3839c34e39
commit 8d159ffba0
6 changed files with 245 additions and 707 deletions


@@ -1,6 +1,6 @@
#!/usr/bin/env python3
import base64
import asyncio
import ipaddress
import json
import logging
@@ -13,10 +13,10 @@ from urllib.parse import urlsplit
from defang import refang # type: ignore
from redis import Redis
from scrapysplashwrapper import crawl
from playwrightcapture import Capture
from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
from lookyloo.helpers import get_captures_dir, get_splash_url, load_cookies, splash_status
from lookyloo.helpers import get_captures_dir, load_cookies
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
level=logging.INFO)
@@ -29,10 +29,9 @@ class AsyncCapture(AbstractManager):
self.script_name = 'async_capture'
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
self.capture_dir: Path = get_captures_dir()
self.splash_url: str = get_splash_url()
self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
def process_capture_queue(self) -> None:
async def process_capture_queue(self) -> None:
'''Process a query from the capture queue'''
value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture') # type: ignore
if not value or not value[0]:
@@ -69,11 +68,10 @@ class AsyncCapture(AbstractManager):
headers[header.strip()] = h_value.strip()
self.logger.info(f'Capturing {to_capture["url"]} - {uuid}')
success, error_message = self._capture(
success, error_message = await self._capture(
to_capture['url'],
perma_uuid=uuid,
cookies_pseudofile=to_capture.get('cookies', None),
depth=int(to_capture.get('depth', 1)),
listing=listing,
user_agent=to_capture.get('user_agent', None),
referer=to_capture.get('referer', None),
@@ -94,17 +92,18 @@ class AsyncCapture(AbstractManager):
lazy_cleanup.expire('queues', 600)
lazy_cleanup.execute()
def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None, proxy: Optional[str]=None, os: Optional[str]=None,
browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
listing: bool=True, user_agent: Optional[str]=None,
referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None,
proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
'''Launch a capture'''
url = url.strip()
url = refang(url)
if not url.startswith('http'):
url = f'http://{url}'
splitted_url = urlsplit(url)
if self.only_global_lookups:
splitted_url = urlsplit(url)
if splitted_url.netloc:
if splitted_url.hostname:
if splitted_url.hostname.split('.')[-1] != 'onion':
@@ -118,6 +117,11 @@ class AsyncCapture(AbstractManager):
else:
return False, 'Unable to find hostname or IP in the query.'
# check if onion
if (not proxy and splitted_url.netloc and splitted_url.hostname
and splitted_url.hostname.split('.')[-1] == 'onion'):
proxy = get_config('generic', 'tor_proxy')
cookies = load_cookies(cookies_pseudofile)
if not user_agent:
# Catch case where the UA is broken on the UI, and the async submission.
@@ -125,22 +129,26 @@ class AsyncCapture(AbstractManager):
else:
ua = user_agent
if int(depth) > int(get_config('generic', 'max_depth')):
self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
depth = int(get_config('generic', 'max_depth'))
self.logger.info(f'Capturing {url}')
try:
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
referer=referer, headers=headers, proxy=proxy, log_enabled=True,
log_level=get_config('generic', 'splash_loglevel'))
capture = Capture()
if proxy:
await capture.prepare_capture(proxy=proxy)
else:
await capture.prepare_capture()
capture.cookies = cookies
capture.user_agent = ua
if headers:
capture.http_headers = headers
await capture.prepare_context()
entries = await capture.capture_page(url, referer=referer)
except Exception as e:
self.logger.critical(f'Something went terribly wrong when capturing {url}.')
raise e
if not items:
if not entries:
# broken
self.logger.critical(f'Something went terribly wrong when capturing {url}.')
return False, f'Something went terribly wrong when capturing {url}.'
width = len(str(len(items)))
now = datetime.now()
dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
safe_create_dir(dirpath)
@@ -167,53 +175,42 @@ class AsyncCapture(AbstractManager):
with (dirpath / 'parent').open('w') as _parent:
_parent.write(parent)
for i, item in enumerate(items):
if 'error' in item:
with (dirpath / 'error.txt').open('w') as _error:
json.dump(item['error'], _error)
if 'error' in entries:
with (dirpath / 'error.txt').open('w') as _error:
json.dump(entries['error'], _error)
# The capture went fine
harfile = item['har']
png = base64.b64decode(item['png'])
html = item['html']
last_redirect = item['last_redirected_url']
# The capture went fine
harfile = entries['har']
png = entries['png']
html = entries['html']
last_redirect = entries['last_redirected_url']
with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
json.dump(harfile, _har)
with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
_img.write(png)
with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
_html.write(html)
with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
_redir.write(last_redirect)
with (dirpath / '0.har').open('w') as _har:
json.dump(harfile, _har)
with (dirpath / '0.png').open('wb') as _img:
_img.write(png)
with (dirpath / '0.html').open('w') as _html:
_html.write(html)
with (dirpath / '0.last_redirect.txt').open('w') as _redir:
_redir.write(last_redirect)
if 'childFrames' in item:
child_frames = item['childFrames']
with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
json.dump(child_frames, _iframes)
if 'cookies' in item:
cookies = item['cookies']
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
json.dump(cookies, _cookies)
if 'cookies' in entries:
cookies = entries['cookies']
with (dirpath / '0.cookies.json').open('w') as _cookies:
json.dump(cookies, _cookies)
self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
return True, 'All good!'
def _to_run_forever(self):
async def _to_run_forever_async(self):
while self.redis.exists('to_capture'):
status, message = splash_status()
if not status:
self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
break
self.process_capture_queue()
await self.process_capture_queue()
if self.shutdown_requested():
break
def main():
m = AsyncCapture()
m.run(sleep_in_sec=1)
asyncio.run(m.run_async(sleep_in_sec=1))
if __name__ == '__main__':

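Taken together, the hunks above swap the scrapysplashwrapper crawl() call for playwrightcapture's async Capture object: the capture is prepared, configured (cookies, user agent, headers), and capture_page() returns a single dict instead of a list of per-depth items, which is why the numbered output files collapse into 0.har, 0.png, 0.html and friends. A minimal sketch of that flow, built only from the calls visible in this diff (method names and the keys of the returned dict come from the hunks above; the real PlaywrightCapture API may differ between versions):

import asyncio
import json
from typing import Optional

from playwrightcapture import Capture


async def capture_once(url: str, proxy: Optional[dict] = None) -> None:
    capture = Capture()
    if proxy:
        await capture.prepare_capture(proxy=proxy)
    else:
        await capture.prepare_capture()
    capture.user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'  # placeholder UA, not from the diff
    await capture.prepare_context()
    entries = await capture.capture_page(url)
    if 'error' in entries:
        print('capture failed:', entries['error'])
        return
    # Same artefacts the async capture manager writes into the capture directory
    with open('0.har', 'w') as _har:
        json.dump(entries['har'], _har)
    with open('0.png', 'wb') as _img:
        _img.write(entries['png'])
    with open('0.html', 'w') as _html:
        _html.write(entries['html'])


asyncio.run(capture_once('https://www.example.com'))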

@@ -25,6 +25,9 @@
"enable_bookmark": false,
"auto_trigger_modules": false,
"enable_mail_notification": false,
"tor_proxy": {
"server": "socks5://127.0.0.1:9050"
},
"email": {
"from": "Lookyloo <lookyloo@myorg.local>",
"to": "Investigation Team <investigation_unit@myorg.local>",

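The new tor_proxy block is what the onion check added above reads via get_config('generic', 'tor_proxy') and hands to prepare_capture(proxy=...). Its {"server": ...} shape matches Playwright's own launch(proxy=...) option, so the value can be passed through largely unchanged. A rough illustration with plain Playwright, assuming a local Tor SOCKS proxy listening on 127.0.0.1:9050 as in the sample value:

import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        # Same {"server": ...} structure as the tor_proxy sample above.
        browser = await p.chromium.launch(proxy={'server': 'socks5://127.0.0.1:9050'})
        page = await browser.new_page()
        await page.goto('https://check.torproject.org/')  # needs the local Tor proxy to actually be running
        print(await page.title())
        await browser.close()


asyncio.run(main())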

@@ -335,7 +335,7 @@ class Lookyloo():
self.logger.warning(e)
return None
except Exception as e:
self.logger.critical(e)
self.logger.exception(e)
return None
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:

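The one-line change in the Lookyloo class, from self.logger.critical(e) to self.logger.exception(e), mostly matters for debugging: called inside an except block, Logger.exception logs at ERROR level and appends the full traceback, whereas critical(e) records only the exception's message. A quick standard-library illustration:

import logging

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO)
logger = logging.getLogger('demo')

try:
    1 / 0
except ZeroDivisionError as e:
    logger.critical(e)    # logs only the message: "division by zero"
    logger.exception(e)   # logs the message plus the full traceback
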
poetry.lock (generated, 828 changed lines)

File diff suppressed because it is too large


@@ -46,9 +46,7 @@ bootstrap-flask = "^2.0.2"
defang = "^0.5.3"
vt-py = "^0.14.0"
pyeupi = "^1.1"
scrapysplashwrapper = "^1.11.0"
pysanejs = "^2.0"
har2tree = "^1.11.0"
pylookyloo = "^1.10.0"
dnspython = "^2.2.1"
pytaxonomies = "^1.4.1"
@@ -66,6 +64,8 @@ pyhashlookup = "^1.1.1"
lief = "^0.12.1"
ua-parser = "^0.10.0"
Flask-Login = "^0.6.0"
playwrightcapture = {git = "https://github.com/Lookyloo/PlaywrightCapture.git", rev = "main"}
har2tree = {git = "https://github.com/Lookyloo/har2tree.git", rev = "main"}
[tool.poetry.extras]
misp = ['python-magic', 'pydeep']


@@ -24,7 +24,7 @@ from werkzeug.security import check_password_hash
from lookyloo.default import get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.helpers import (CaptureStatus, get_taxonomies,
get_user_agents, load_cookies, splash_status)
get_user_agents, load_cookies)
from lookyloo.lookyloo import Indexing, Lookyloo
from .genericapi import api as generic_api
@@ -600,10 +600,6 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
cache = lookyloo.capture_cache(tree_uuid)
if not cache:
status = lookyloo.get_capture_status(tree_uuid)
splash_up, splash_message = splash_status()
if not splash_up:
flash(f'The capture module is not reachable ({splash_message}).', 'error')
flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
if status == CaptureStatus.UNKNOWN:
error = lookyloo.try_error_status(tree_uuid)
if error:
@@ -810,10 +806,6 @@ def _prepare_capture_template(user_ua: Optional[str], predefined_url: Optional[s
if 'bot' not in ua['useragent'].lower():
default_ua = ua
break
splash_up, message = splash_status()
if not splash_up:
flash(f'The capture module is not reachable ({message}).', 'error')
flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
return render_template('capture.html', user_agents=user_agents, default=default_ua,
max_depth=max_depth, personal_ua=user_ua,
default_public=get_config('generic', 'default_public'),