chg: Improve error handling, especially for broken CaptureSettings

pull/926/head
Raphaël Vinot 2024-07-22 13:14:21 +02:00
parent dad0917e4c
commit d5fb385add
5 changed files with 96 additions and 47 deletions

View File

@ -56,7 +56,7 @@ class AsyncCapture(AbstractManager):
'''Get the list of captures ready to be processed'''
# Only check if the top 50 in the priority list are done, as they are the most likely ones to be
# and if the list is very, very long, iterating over it takes a very long time.
return [uuid for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf', start=0, num=50)
return [uuid for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf', start=0, num=500)
if uuid and self.lookyloo.lacus.get_capture_status(uuid) in [CaptureStatusPy.DONE, CaptureStatusCore.DONE]]
def process_capture_queue(self) -> None:

View File

@ -10,7 +10,7 @@ from collections import Counter
from datetime import date, timedelta
from typing import Any
from lacuscore import CaptureStatus as CaptureStatusCore
from lacuscore import CaptureStatus as CaptureStatusCore, CaptureSettingsError
from lookyloo import Lookyloo
from lookyloo.exceptions import LacusUnreachable
from lookyloo.default import AbstractManager, get_config, get_homedir, safe_create_dir
@ -109,41 +109,49 @@ class Processing(AbstractManager):
continue
self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')
# This capture couldn't be queued and we created the uuid locally
if query := self.lookyloo.get_capture_settings(uuid):
try:
new_uuid = self.lookyloo.lacus.enqueue(
url=query.url,
document_name=query.document_name,
document=query.document,
# depth=query.depth,
browser=query.browser,
device_name=query.device_name,
user_agent=query.user_agent,
proxy=query.proxy,
general_timeout_in_sec=query.general_timeout_in_sec,
cookies=query.cookies,
headers=query.headers,
http_credentials=query.http_credentials,
viewport=query.viewport,
referer=query.referer,
rendered_hostname_only=query.rendered_hostname_only,
# force=query.force,
# recapture_interval=query.recapture_interval,
priority=query.priority,
uuid=uuid
)
if new_uuid != uuid:
# somehow, between the check and queuing, the UUID isn't UNKNOWN anymore, just checking that
self.logger.warning(f'Had to change the capture UUID (duplicate). Old: {uuid} / New: {new_uuid}')
except LacusUnreachable:
self.logger.warning('Lacus still unreachable.')
break
except Exception as e:
self.logger.warning(f'Still unable to enqueue capture: {e}')
break
else:
self.lookyloo.redis.hdel(uuid, 'not_queued')
self.logger.info(f'{uuid} enqueued.')
try:
if query := self.lookyloo.get_capture_settings(uuid):
try:
new_uuid = self.lookyloo.lacus.enqueue(
url=query.url,
document_name=query.document_name,
document=query.document,
# depth=query.depth,
browser=query.browser,
device_name=query.device_name,
user_agent=query.user_agent,
proxy=query.proxy,
general_timeout_in_sec=query.general_timeout_in_sec,
cookies=query.cookies,
headers=query.headers,
http_credentials=query.http_credentials,
viewport=query.viewport,
referer=query.referer,
rendered_hostname_only=query.rendered_hostname_only,
# force=query.force,
# recapture_interval=query.recapture_interval,
priority=query.priority,
uuid=uuid
)
if new_uuid != uuid:
# somehow, between the check and queuing, the UUID isn't UNKNOWN anymore, just checking that
self.logger.warning(f'Had to change the capture UUID (duplicate). Old: {uuid} / New: {new_uuid}')
except LacusUnreachable:
self.logger.warning('Lacus still unreachable.')
break
except Exception as e:
self.logger.warning(f'Still unable to enqueue capture: {e}')
break
else:
self.lookyloo.redis.hdel(uuid, 'not_queued')
self.logger.info(f'{uuid} enqueued.')
except CaptureSettingsError as e:
self.logger.error(f'Broken settings for {uuid} made their way in the cache, removing them: {e}')
self.lookyloo.redis.zrem('to_capture', uuid)
self.lookyloo.redis.delete(uuid)
except Exception as e:
self.logger.error(f'Unable to requeue {uuid}: {e}')
def main() -> None:

View File

@ -29,7 +29,7 @@ import mmh3
from defang import defang # type: ignore[import-untyped]
from har2tree import CrawledTree, HostNode, URLNode
from lacuscore import (LacusCore,
from lacuscore import (LacusCore, CaptureSettingsError,
CaptureStatus as CaptureStatusCore,
# CaptureResponse as CaptureResponseCore)
# CaptureResponseJson as CaptureResponseJsonCore,
@ -287,15 +287,25 @@ class Lookyloo():
return meta
def get_capture_settings(self, capture_uuid: str, /) -> CaptureSettings | None:
    '''Get the capture settings from the cache or the disk.

    The redis hash stored under `capture_uuid` is tried first. If it is empty,
    the `capture_settings.json` file of the cached capture directory is used.

    Returns None when no settings are found, or when building CaptureSettings
    raises CaptureSettingsError (a warning is logged in that case).
    '''
    # Every CaptureSettings construction is guarded: an unguarded call placed
    # before the try block would let CaptureSettingsError escape to the caller
    # and make the guarded copy unreachable.
    try:
        if capture_settings := self.redis.hgetall(capture_uuid):
            return CaptureSettings(**capture_settings)
    except CaptureSettingsError as e:
        self.logger.warning(f'Invalid capture settings for {capture_uuid}: {e}')
        return None

    # Nothing in redis for this UUID: fall back to the settings stored on disk.
    cache = self.capture_cache(capture_uuid)
    if not cache:
        return None
    cs_file = cache.capture_dir / 'capture_settings.json'
    if cs_file.exists():
        try:
            with cs_file.open('r') as f:
                return CaptureSettings(**json.load(f))
        except CaptureSettingsError as e:
            self.logger.warning(f'[In file!] Invalid capture settings for {capture_uuid}: {e}')
            return None
    return None
def categories_capture(self, capture_uuid: str, /) -> dict[str, Any]:
@ -650,7 +660,11 @@ class Lookyloo():
query.headers['dnt'] = query.dnt
if authenticated:
if user_config := load_user_config(user):
query = self._apply_user_config(query, user_config)
try:
query = self._apply_user_config(query, user_config)
except CaptureSettingsError as e:
self.logger.critical(f'Unable to apply user config for {user}: {e}')
raise e
priority = get_priority(source, user, authenticated)
if priority < -100:
@ -714,7 +728,11 @@ class Lookyloo():
if to_return['contacts']:
to_return['all_emails'] |= set(to_return['contacts'])
to_return['ips'] = {ip: self.uwhois.whois(ip, contact_email_only=True) for ip in set(hostnode.resolved_ips['v4']) | set(hostnode.resolved_ips['v6'])}
to_return['asns'] = {asn['asn']: self.uwhois.whois(f'AS{asn["asn"]}', contact_email_only=True) for asn in hostnode.ipasn.values()}
if hasattr(hostnode, 'ipasn'):
to_return['asns'] = {asn['asn']: self.uwhois.whois(f'AS{asn["asn"]}', contact_email_only=True) for asn in hostnode.ipasn.values()}
else:
self.logger.warning(f'No IPASN for {hostnode.name}')
to_return['asns'] = {}
# try to get contact from security.txt file
try:
@ -1429,7 +1447,11 @@ class Lookyloo():
error = lookyloo_capture.read(filename).decode()
elif filename.endswith('capture_settings.json'):
_capture_settings = json.loads(lookyloo_capture.read(filename))
capture_settings = CaptureSettings(**_capture_settings)
try:
capture_settings = CaptureSettings(**_capture_settings)
except CaptureSettingsError as e:
unrecoverable_error = True
messages['errors'].append(f'Invalid Capture Settings: {e}')
else:
for to_skip in files_to_skip:
if filename.endswith(to_skip):

View File

@ -32,7 +32,7 @@ from flask import (Flask, Response, Request, flash, jsonify, redirect, render_te
from flask_bootstrap import Bootstrap5 # type: ignore[import-untyped]
from flask_cors import CORS # type: ignore[import-untyped]
from flask_restx import Api # type: ignore[import-untyped]
from lacuscore import CaptureStatus
from lacuscore import CaptureStatus, CaptureSettingsError
from puremagic import from_string
from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined]
from werkzeug.security import check_password_hash
@ -283,6 +283,16 @@ def file_response(func): # type: ignore[no-untyped-def]
return wrapper
@app.errorhandler(CaptureSettingsError)
def handle_pydandic_validation_exception(error: CaptureSettingsError) -> Response | str | WerkzeugResponse:
    '''Flash the capture settings error and redirect to the landing page.

    When the error carries pydantic validation errors, their details are
    included in the flashed message; otherwise the error itself is flashed.
    '''
    validation_errors = error.pydantic_validation_errors
    if validation_errors:
        message = f'Unable to validate capture settings: {validation_errors.errors()}'
    else:
        message = str(error)
    flash(message)
    return redirect(url_for('landing_page'))
# ##### Methods querying the indexes #####
def _get_body_hash_investigator(body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]:

View File

@ -17,7 +17,7 @@ from flask import request, send_file, Response
from flask_restx import Namespace, Resource, fields, abort # type: ignore[import-untyped]
from werkzeug.security import check_password_hash
from lacuscore import CaptureStatus as CaptureStatusCore
from lacuscore import CaptureStatus as CaptureStatusCore, CaptureSettingsError
from pylacus import CaptureStatus as CaptureStatusPy
from lookyloo import CaptureSettings, Lookyloo
from lookyloo.comparator import Comparator
@ -51,6 +51,15 @@ def handle_no_HAR_file_exception(error: Any) -> tuple[dict[str, str], int]:
return {'message': str(error)}, 400
@api.errorhandler(CaptureSettingsError)  # type: ignore[misc]
def handle_pydandic_validation_exception(error: CaptureSettingsError) -> tuple[dict[str, Any], int]:
    '''Turn a CaptureSettingsError into a JSON payload with a 400 status code.

    The payload always has a 'message' key; a 'details' key with the pydantic
    error list is added when the error carries validation errors.
    '''
    payload: dict[str, Any]
    if not error.pydantic_validation_errors:
        # No structured validation details: report the error text only.
        payload = {'message': str(error)}
    else:
        payload = {'message': 'Unable to validate capture settings.',
                   'details': error.pydantic_validation_errors.errors()}
    return payload, 400
@api.route('/json/get_user_config')
@api.doc(description='Get the configuration of the user (if any)', security='apikey')
class UserConfig(Resource): # type: ignore[misc]