mirror of https://github.com/CIRCL/lookyloo
new: Switch away from splash to use playwright
parent
3839c34e39
commit
8d159ffba0
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import base64
|
||||
import asyncio
|
||||
import ipaddress
|
||||
import json
|
||||
import logging
|
||||
|
@ -13,10 +13,10 @@ from urllib.parse import urlsplit
|
|||
|
||||
from defang import refang # type: ignore
|
||||
from redis import Redis
|
||||
from scrapysplashwrapper import crawl
|
||||
from playwrightcapture import Capture
|
||||
|
||||
from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
|
||||
from lookyloo.helpers import get_captures_dir, get_splash_url, load_cookies, splash_status
|
||||
from lookyloo.helpers import get_captures_dir, load_cookies
|
||||
|
||||
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
|
||||
level=logging.INFO)
|
||||
|
@ -29,10 +29,9 @@ class AsyncCapture(AbstractManager):
|
|||
self.script_name = 'async_capture'
|
||||
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
|
||||
self.capture_dir: Path = get_captures_dir()
|
||||
self.splash_url: str = get_splash_url()
|
||||
self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
|
||||
|
||||
def process_capture_queue(self) -> None:
|
||||
async def process_capture_queue(self) -> None:
|
||||
'''Process a query from the capture queue'''
|
||||
value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture') # type: ignore
|
||||
if not value or not value[0]:
|
||||
|
@ -69,11 +68,10 @@ class AsyncCapture(AbstractManager):
|
|||
headers[header.strip()] = h_value.strip()
|
||||
|
||||
self.logger.info(f'Capturing {to_capture["url"]} - {uuid}')
|
||||
success, error_message = self._capture(
|
||||
success, error_message = await self._capture(
|
||||
to_capture['url'],
|
||||
perma_uuid=uuid,
|
||||
cookies_pseudofile=to_capture.get('cookies', None),
|
||||
depth=int(to_capture.get('depth', 1)),
|
||||
listing=listing,
|
||||
user_agent=to_capture.get('user_agent', None),
|
||||
referer=to_capture.get('referer', None),
|
||||
|
@ -94,17 +92,18 @@ class AsyncCapture(AbstractManager):
|
|||
lazy_cleanup.expire('queues', 600)
|
||||
lazy_cleanup.execute()
|
||||
|
||||
def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
|
||||
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
|
||||
referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None, proxy: Optional[str]=None, os: Optional[str]=None,
|
||||
browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
|
||||
async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
|
||||
listing: bool=True, user_agent: Optional[str]=None,
|
||||
referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None,
|
||||
proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
|
||||
browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
|
||||
'''Launch a capture'''
|
||||
url = url.strip()
|
||||
url = refang(url)
|
||||
if not url.startswith('http'):
|
||||
url = f'http://{url}'
|
||||
splitted_url = urlsplit(url)
|
||||
if self.only_global_lookups:
|
||||
splitted_url = urlsplit(url)
|
||||
if splitted_url.netloc:
|
||||
if splitted_url.hostname:
|
||||
if splitted_url.hostname.split('.')[-1] != 'onion':
|
||||
|
@ -118,6 +117,11 @@ class AsyncCapture(AbstractManager):
|
|||
else:
|
||||
return False, 'Unable to find hostname or IP in the query.'
|
||||
|
||||
# check if onion
|
||||
if (not proxy and splitted_url.netloc and splitted_url.hostname
|
||||
and splitted_url.hostname.split('.')[-1] == 'onion'):
|
||||
proxy = get_config('generic', 'tor_proxy')
|
||||
|
||||
cookies = load_cookies(cookies_pseudofile)
|
||||
if not user_agent:
|
||||
# Catch case where the UA is broken on the UI, and the async submission.
|
||||
|
@ -125,22 +129,26 @@ class AsyncCapture(AbstractManager):
|
|||
else:
|
||||
ua = user_agent
|
||||
|
||||
if int(depth) > int(get_config('generic', 'max_depth')):
|
||||
self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
|
||||
depth = int(get_config('generic', 'max_depth'))
|
||||
self.logger.info(f'Capturing {url}')
|
||||
try:
|
||||
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
|
||||
referer=referer, headers=headers, proxy=proxy, log_enabled=True,
|
||||
log_level=get_config('generic', 'splash_loglevel'))
|
||||
capture = Capture()
|
||||
if proxy:
|
||||
await capture.prepare_capture(proxy=proxy)
|
||||
else:
|
||||
await capture.prepare_capture()
|
||||
capture.cookies = cookies
|
||||
capture.user_agent = ua
|
||||
if headers:
|
||||
capture.http_headers = headers
|
||||
await capture.prepare_context()
|
||||
entries = await capture.capture_page(url, referer=referer)
|
||||
except Exception as e:
|
||||
self.logger.critical(f'Something went terribly wrong when capturing {url}.')
|
||||
raise e
|
||||
if not items:
|
||||
if not entries:
|
||||
# broken
|
||||
self.logger.critical(f'Something went terribly wrong when capturing {url}.')
|
||||
return False, f'Something went terribly wrong when capturing {url}.'
|
||||
width = len(str(len(items)))
|
||||
now = datetime.now()
|
||||
dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
|
||||
safe_create_dir(dirpath)
|
||||
|
@ -167,53 +175,42 @@ class AsyncCapture(AbstractManager):
|
|||
with (dirpath / 'parent').open('w') as _parent:
|
||||
_parent.write(parent)
|
||||
|
||||
for i, item in enumerate(items):
|
||||
if 'error' in item:
|
||||
with (dirpath / 'error.txt').open('w') as _error:
|
||||
json.dump(item['error'], _error)
|
||||
if 'error' in entries:
|
||||
with (dirpath / 'error.txt').open('w') as _error:
|
||||
json.dump(entries['error'], _error)
|
||||
|
||||
# The capture went fine
|
||||
harfile = item['har']
|
||||
png = base64.b64decode(item['png'])
|
||||
html = item['html']
|
||||
last_redirect = item['last_redirected_url']
|
||||
# The capture went fine
|
||||
harfile = entries['har']
|
||||
png = entries['png']
|
||||
html = entries['html']
|
||||
last_redirect = entries['last_redirected_url']
|
||||
|
||||
with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
|
||||
json.dump(harfile, _har)
|
||||
with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
|
||||
_img.write(png)
|
||||
with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
|
||||
_html.write(html)
|
||||
with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
|
||||
_redir.write(last_redirect)
|
||||
with (dirpath / '0.har').open('w') as _har:
|
||||
json.dump(harfile, _har)
|
||||
with (dirpath / '0.png').open('wb') as _img:
|
||||
_img.write(png)
|
||||
with (dirpath / '0.html').open('w') as _html:
|
||||
_html.write(html)
|
||||
with (dirpath / '0.last_redirect.txt').open('w') as _redir:
|
||||
_redir.write(last_redirect)
|
||||
|
||||
if 'childFrames' in item:
|
||||
child_frames = item['childFrames']
|
||||
with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
|
||||
json.dump(child_frames, _iframes)
|
||||
|
||||
if 'cookies' in item:
|
||||
cookies = item['cookies']
|
||||
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
|
||||
json.dump(cookies, _cookies)
|
||||
if 'cookies' in entries:
|
||||
cookies = entries['cookies']
|
||||
with (dirpath / '0.cookies.json').open('w') as _cookies:
|
||||
json.dump(cookies, _cookies)
|
||||
self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
|
||||
return True, 'All good!'
|
||||
|
||||
def _to_run_forever(self):
|
||||
async def _to_run_forever_async(self):
|
||||
while self.redis.exists('to_capture'):
|
||||
status, message = splash_status()
|
||||
if not status:
|
||||
self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
|
||||
break
|
||||
|
||||
self.process_capture_queue()
|
||||
await self.process_capture_queue()
|
||||
if self.shutdown_requested():
|
||||
break
|
||||
|
||||
|
||||
def main():
|
||||
m = AsyncCapture()
|
||||
m.run(sleep_in_sec=1)
|
||||
asyncio.run(m.run_async(sleep_in_sec=1))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -25,6 +25,9 @@
|
|||
"enable_bookmark": false,
|
||||
"auto_trigger_modules": false,
|
||||
"enable_mail_notification": false,
|
||||
"tor_proxy": {
|
||||
"server": "socks5://127.0.0.1:9050"
|
||||
},
|
||||
"email": {
|
||||
"from": "Lookyloo <lookyloo@myorg.local>",
|
||||
"to": "Investigation Team <investigation_unit@myorg.local>",
|
||||
|
|
|
@ -335,7 +335,7 @@ class Lookyloo():
|
|||
self.logger.warning(e)
|
||||
return None
|
||||
except Exception as e:
|
||||
self.logger.critical(e)
|
||||
self.logger.exception(e)
|
||||
return None
|
||||
|
||||
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -46,9 +46,7 @@ bootstrap-flask = "^2.0.2"
|
|||
defang = "^0.5.3"
|
||||
vt-py = "^0.14.0"
|
||||
pyeupi = "^1.1"
|
||||
scrapysplashwrapper = "^1.11.0"
|
||||
pysanejs = "^2.0"
|
||||
har2tree = "^1.11.0"
|
||||
pylookyloo = "^1.10.0"
|
||||
dnspython = "^2.2.1"
|
||||
pytaxonomies = "^1.4.1"
|
||||
|
@ -66,6 +64,8 @@ pyhashlookup = "^1.1.1"
|
|||
lief = "^0.12.1"
|
||||
ua-parser = "^0.10.0"
|
||||
Flask-Login = "^0.6.0"
|
||||
playwrightcapture = {git = "https://github.com/Lookyloo/PlaywrightCapture.git", rev = "main"}
|
||||
har2tree = {git = "https://github.com/Lookyloo/har2tree.git", rev = "main"}
|
||||
|
||||
[tool.poetry.extras]
|
||||
misp = ['python-magic', 'pydeep']
|
||||
|
|
|
@ -24,7 +24,7 @@ from werkzeug.security import check_password_hash
|
|||
from lookyloo.default import get_config
|
||||
from lookyloo.exceptions import MissingUUID, NoValidHarFile
|
||||
from lookyloo.helpers import (CaptureStatus, get_taxonomies,
|
||||
get_user_agents, load_cookies, splash_status)
|
||||
get_user_agents, load_cookies)
|
||||
from lookyloo.lookyloo import Indexing, Lookyloo
|
||||
|
||||
from .genericapi import api as generic_api
|
||||
|
@ -600,10 +600,6 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
|
|||
cache = lookyloo.capture_cache(tree_uuid)
|
||||
if not cache:
|
||||
status = lookyloo.get_capture_status(tree_uuid)
|
||||
splash_up, splash_message = splash_status()
|
||||
if not splash_up:
|
||||
flash(f'The capture module is not reachable ({splash_message}).', 'error')
|
||||
flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
|
||||
if status == CaptureStatus.UNKNOWN:
|
||||
error = lookyloo.try_error_status(tree_uuid)
|
||||
if error:
|
||||
|
@ -810,10 +806,6 @@ def _prepare_capture_template(user_ua: Optional[str], predefined_url: Optional[s
|
|||
if 'bot' not in ua['useragent'].lower():
|
||||
default_ua = ua
|
||||
break
|
||||
splash_up, message = splash_status()
|
||||
if not splash_up:
|
||||
flash(f'The capture module is not reachable ({message}).', 'error')
|
||||
flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
|
||||
return render_template('capture.html', user_agents=user_agents, default=default_ua,
|
||||
max_depth=max_depth, personal_ua=user_ua,
|
||||
default_public=get_config('generic', 'default_public'),
|
||||
|
|
Loading…
Reference in New Issue