mirror of https://github.com/CIRCL/lookyloo

new: Switch away from splash to use playwright

parent 3839c34e39
commit 8d159ffba0
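In short: the async capture worker no longer ships URLs to a Splash instance through scrapysplashwrapper.crawl(); it drives a browser directly with PlaywrightCapture, and the recursive depth/childFrames handling goes away because a capture now covers a single page. Below is a minimal, self-contained sketch of the new flow, using only the PlaywrightCapture calls this commit relies on (prepare_capture, prepare_context, capture_page and the har/png/html entries). The capture_one wrapper, example URL and output directory are illustrative only, not part of the change.

import asyncio
import json
from pathlib import Path

from playwrightcapture import Capture


async def capture_one(url: str, proxy=None) -> None:
    # Prepare a browser context the same way the new _capture() does.
    capture = Capture()
    if proxy:
        await capture.prepare_capture(proxy=proxy)
    else:
        await capture.prepare_capture()
    await capture.prepare_context()

    # A single page capture returns a dict with 'har', 'png', 'html',
    # 'last_redirected_url', and optionally 'cookies' and 'error'.
    entries = await capture.capture_page(url)
    if not entries or 'har' not in entries:
        print('Capture failed:', entries.get('error') if entries else 'no result')
        return

    # Write the same artefacts async_capture now stores (single page, hence '0.*').
    dirpath = Path('capture_example')
    dirpath.mkdir(exist_ok=True)
    with (dirpath / '0.har').open('w') as _har:
        json.dump(entries['har'], _har)
    with (dirpath / '0.png').open('wb') as _img:
        _img.write(entries['png'])
    with (dirpath / '0.html').open('w') as _html:
        _html.write(entries['html'])


if __name__ == '__main__':
    asyncio.run(capture_one('https://www.circl.lu'))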
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-import base64
+import asyncio
 import ipaddress
 import json
 import logging
@@ -13,10 +13,10 @@ from urllib.parse import urlsplit

 from defang import refang  # type: ignore
 from redis import Redis
-from scrapysplashwrapper import crawl
+from playwrightcapture import Capture

 from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
-from lookyloo.helpers import get_captures_dir, get_splash_url, load_cookies, splash_status
+from lookyloo.helpers import get_captures_dir, load_cookies

 logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                     level=logging.INFO)
@@ -29,10 +29,9 @@ class AsyncCapture(AbstractManager):
         self.script_name = 'async_capture'
         self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
         self.capture_dir: Path = get_captures_dir()
-        self.splash_url: str = get_splash_url()
         self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

-    def process_capture_queue(self) -> None:
+    async def process_capture_queue(self) -> None:
         '''Process a query from the capture queue'''
         value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture')  # type: ignore
         if not value or not value[0]:
@@ -69,11 +68,10 @@
                     headers[header.strip()] = h_value.strip()

         self.logger.info(f'Capturing {to_capture["url"]} - {uuid}')
-        success, error_message = self._capture(
+        success, error_message = await self._capture(
             to_capture['url'],
             perma_uuid=uuid,
             cookies_pseudofile=to_capture.get('cookies', None),
-            depth=int(to_capture.get('depth', 1)),
             listing=listing,
             user_agent=to_capture.get('user_agent', None),
             referer=to_capture.get('referer', None),
@@ -94,17 +92,18 @@
         lazy_cleanup.expire('queues', 600)
         lazy_cleanup.execute()

-    def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
-                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
-                 referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None, proxy: Optional[str]=None, os: Optional[str]=None,
+    async def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+                       listing: bool=True, user_agent: Optional[str]=None,
+                       referer: Optional[str]=None, headers: Optional[Dict[str, str]]=None,
+                       proxy: Optional[Union[str, Dict]]=None, os: Optional[str]=None,
                        browser: Optional[str]=None, parent: Optional[str]=None) -> Tuple[bool, str]:
         '''Launch a capture'''
         url = url.strip()
         url = refang(url)
         if not url.startswith('http'):
             url = f'http://{url}'
-        if self.only_global_lookups:
-            splitted_url = urlsplit(url)
+        splitted_url = urlsplit(url)
+        if self.only_global_lookups:
             if splitted_url.netloc:
                 if splitted_url.hostname:
                     if splitted_url.hostname.split('.')[-1] != 'onion':
@@ -118,6 +117,11 @@ class AsyncCapture(AbstractManager):
             else:
                 return False, 'Unable to find hostname or IP in the query.'

+        # check if onion
+        if (not proxy and splitted_url.netloc and splitted_url.hostname
+                and splitted_url.hostname.split('.')[-1] == 'onion'):
+            proxy = get_config('generic', 'tor_proxy')
+
         cookies = load_cookies(cookies_pseudofile)
         if not user_agent:
             # Catch case where the UA is broken on the UI, and the async submission.
@@ -125,22 +129,26 @@
         else:
             ua = user_agent

-        if int(depth) > int(get_config('generic', 'max_depth')):
-            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
-            depth = int(get_config('generic', 'max_depth'))
         self.logger.info(f'Capturing {url}')
         try:
-            items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
-                          referer=referer, headers=headers, proxy=proxy, log_enabled=True,
-                          log_level=get_config('generic', 'splash_loglevel'))
+            capture = Capture()
+            if proxy:
+                await capture.prepare_capture(proxy=proxy)
+            else:
+                await capture.prepare_capture()
+            capture.cookies = cookies
+            capture.user_agent = ua
+            if headers:
+                capture.http_headers = headers
+            await capture.prepare_context()
+            entries = await capture.capture_page(url, referer=referer)
         except Exception as e:
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             raise e
-        if not items:
+        if not entries:
             # broken
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False, f'Something went terribly wrong when capturing {url}.'
-        width = len(str(len(items)))
         now = datetime.now()
         dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
         safe_create_dir(dirpath)
@@ -167,53 +175,42 @@
             with (dirpath / 'parent').open('w') as _parent:
                 _parent.write(parent)

-        for i, item in enumerate(items):
-            if 'error' in item:
-                with (dirpath / 'error.txt').open('w') as _error:
-                    json.dump(item['error'], _error)
-
-            # The capture went fine
-            harfile = item['har']
-            png = base64.b64decode(item['png'])
-            html = item['html']
-            last_redirect = item['last_redirected_url']
-
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
-                json.dump(harfile, _har)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
-                _img.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
-                _html.write(html)
-            with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
-                _redir.write(last_redirect)
-
-            if 'childFrames' in item:
-                child_frames = item['childFrames']
-                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
-                    json.dump(child_frames, _iframes)
-
-            if 'cookies' in item:
-                cookies = item['cookies']
-                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
-                    json.dump(cookies, _cookies)
+        if 'error' in entries:
+            with (dirpath / 'error.txt').open('w') as _error:
+                json.dump(entries['error'], _error)
+
+        # The capture went fine
+        harfile = entries['har']
+        png = entries['png']
+        html = entries['html']
+        last_redirect = entries['last_redirected_url']
+
+        with (dirpath / '0.har').open('w') as _har:
+            json.dump(harfile, _har)
+        with (dirpath / '0.png').open('wb') as _img:
+            _img.write(png)
+        with (dirpath / '0.html').open('w') as _html:
+            _html.write(html)
+        with (dirpath / '0.last_redirect.txt').open('w') as _redir:
+            _redir.write(last_redirect)
+
+        if 'cookies' in entries:
+            cookies = entries['cookies']
+            with (dirpath / '0.cookies.json').open('w') as _cookies:
+                json.dump(cookies, _cookies)
         self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
         return True, 'All good!'

-    def _to_run_forever(self):
+    async def _to_run_forever_async(self):
         while self.redis.exists('to_capture'):
-            status, message = splash_status()
-            if not status:
-                self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
-                break
-
-            self.process_capture_queue()
+            await self.process_capture_queue()
             if self.shutdown_requested():
                 break


 def main():
     m = AsyncCapture()
-    m.run(sleep_in_sec=1)
+    asyncio.run(m.run_async(sleep_in_sec=1))


 if __name__ == '__main__':
@@ -25,6 +25,9 @@
   "enable_bookmark": false,
   "auto_trigger_modules": false,
   "enable_mail_notification": false,
+  "tor_proxy": {
+    "server": "socks5://127.0.0.1:9050"
+  },
   "email": {
     "from": "Lookyloo <lookyloo@myorg.local>",
     "to": "Investigation Team <investigation_unit@myorg.local>",
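The tor_proxy block added above is what the new onion check in _capture() reads via get_config('generic', 'tor_proxy') and hands to prepare_capture(proxy=...). A small sketch of that lookup follows; the pick_proxy helper name is made up for the example, the logic mirrors the diff.

from urllib.parse import urlsplit

from lookyloo.default import get_config


def pick_proxy(url: str, proxy=None):
    # If no proxy was requested and the target is a .onion host, fall back to
    # the configured Tor proxy, e.g. {"server": "socks5://127.0.0.1:9050"}.
    hostname = urlsplit(url).hostname
    if not proxy and hostname and hostname.split('.')[-1] == 'onion':
        proxy = get_config('generic', 'tor_proxy')
    return proxy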
@@ -335,7 +335,7 @@ class Lookyloo():
             self.logger.warning(e)
             return None
         except Exception as e:
-            self.logger.critical(e)
+            self.logger.exception(e)
             return None

     def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:

File diff suppressed because it is too large
@@ -46,9 +46,7 @@ bootstrap-flask = "^2.0.2"
 defang = "^0.5.3"
 vt-py = "^0.14.0"
 pyeupi = "^1.1"
-scrapysplashwrapper = "^1.11.0"
 pysanejs = "^2.0"
-har2tree = "^1.11.0"
 pylookyloo = "^1.10.0"
 dnspython = "^2.2.1"
 pytaxonomies = "^1.4.1"
@@ -66,6 +64,8 @@ pyhashlookup = "^1.1.1"
 lief = "^0.12.1"
 ua-parser = "^0.10.0"
 Flask-Login = "^0.6.0"
+playwrightcapture = {git = "https://github.com/Lookyloo/PlaywrightCapture.git", rev = "main"}
+har2tree = {git = "https://github.com/Lookyloo/har2tree.git", rev = "main"}

 [tool.poetry.extras]
 misp = ['python-magic', 'pydeep']
@@ -24,7 +24,7 @@ from werkzeug.security import check_password_hash
 from lookyloo.default import get_config
 from lookyloo.exceptions import MissingUUID, NoValidHarFile
 from lookyloo.helpers import (CaptureStatus, get_taxonomies,
-                              get_user_agents, load_cookies, splash_status)
+                              get_user_agents, load_cookies)
 from lookyloo.lookyloo import Indexing, Lookyloo

 from .genericapi import api as generic_api
@@ -600,10 +600,6 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
     cache = lookyloo.capture_cache(tree_uuid)
     if not cache:
         status = lookyloo.get_capture_status(tree_uuid)
-        splash_up, splash_message = splash_status()
-        if not splash_up:
-            flash(f'The capture module is not reachable ({splash_message}).', 'error')
-            flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
         if status == CaptureStatus.UNKNOWN:
             error = lookyloo.try_error_status(tree_uuid)
             if error:
@@ -810,10 +806,6 @@ def _prepare_capture_template(user_ua: Optional[str], predefined_url: Optional[s
         if 'bot' not in ua['useragent'].lower():
             default_ua = ua
             break
-    splash_up, message = splash_status()
-    if not splash_up:
-        flash(f'The capture module is not reachable ({message}).', 'error')
-        flash('The request will be enqueued, but capturing may take a while and require the administrator to wake up.', 'error')
     return render_template('capture.html', user_agents=user_agents, default=default_ua,
                            max_depth=max_depth, personal_ua=user_ua,
                            default_public=get_config('generic', 'default_public'),