From d5fb385adda3f9b35a28ab014ceb5e55d6c32732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Mon, 22 Jul 2024 13:14:21 +0200 Subject: [PATCH] chg: Improve error handling, especially for broken CaptureSettings --- bin/async_capture.py | 2 +- bin/background_processing.py | 80 ++++++++++++++++++++---------------- lookyloo/lookyloo.py | 38 +++++++++++++---- website/web/__init__.py | 12 +++++- website/web/genericapi.py | 11 ++++- 5 files changed, 96 insertions(+), 47 deletions(-) diff --git a/bin/async_capture.py b/bin/async_capture.py index 6283524f..bcaff971 100755 --- a/bin/async_capture.py +++ b/bin/async_capture.py @@ -56,7 +56,7 @@ class AsyncCapture(AbstractManager): '''Get the list of captures ready to be processed''' # Only check if the top 50 in the priority list are done, as they are the most likely ones to be # and if the list it very very long, iterating over it takes a very long time. - return [uuid for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf', start=0, num=50) + return [uuid for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf', start=0, num=500) if uuid and self.lookyloo.lacus.get_capture_status(uuid) in [CaptureStatusPy.DONE, CaptureStatusCore.DONE]] def process_capture_queue(self) -> None: diff --git a/bin/background_processing.py b/bin/background_processing.py index 96fddbb2..155000fc 100755 --- a/bin/background_processing.py +++ b/bin/background_processing.py @@ -10,7 +10,7 @@ from collections import Counter from datetime import date, timedelta from typing import Any -from lacuscore import CaptureStatus as CaptureStatusCore +from lacuscore import CaptureStatus as CaptureStatusCore, CaptureSettingsError from lookyloo import Lookyloo from lookyloo.exceptions import LacusUnreachable from lookyloo.default import AbstractManager, get_config, get_homedir, safe_create_dir @@ -109,41 +109,49 @@ class Processing(AbstractManager): continue self.logger.info(f'Found a non-queued 
capture ({uuid}), retrying now.') # This capture couldn't be queued and we created the uuid locally - if query := self.lookyloo.get_capture_settings(uuid): - try: - new_uuid = self.lookyloo.lacus.enqueue( - url=query.url, - document_name=query.document_name, - document=query.document, - # depth=query.depth, - browser=query.browser, - device_name=query.device_name, - user_agent=query.user_agent, - proxy=query.proxy, - general_timeout_in_sec=query.general_timeout_in_sec, - cookies=query.cookies, - headers=query.headers, - http_credentials=query.http_credentials, - viewport=query.viewport, - referer=query.referer, - rendered_hostname_only=query.rendered_hostname_only, - # force=query.force, - # recapture_interval=query.recapture_interval, - priority=query.priority, - uuid=uuid - ) - if new_uuid != uuid: - # somehow, between the check and queuing, the UUID isn't UNKNOWN anymore, just checking that - self.logger.warning(f'Had to change the capture UUID (duplicate). Old: {uuid} / New: {new_uuid}') - except LacusUnreachable: - self.logger.warning('Lacus still unreachable.') - break - except Exception as e: - self.logger.warning(f'Still unable to enqueue capture: {e}') - break - else: - self.lookyloo.redis.hdel(uuid, 'not_queued') - self.logger.info(f'{uuid} enqueued.') + try: + if query := self.lookyloo.get_capture_settings(uuid): + try: + new_uuid = self.lookyloo.lacus.enqueue( + url=query.url, + document_name=query.document_name, + document=query.document, + # depth=query.depth, + browser=query.browser, + device_name=query.device_name, + user_agent=query.user_agent, + proxy=query.proxy, + general_timeout_in_sec=query.general_timeout_in_sec, + cookies=query.cookies, + headers=query.headers, + http_credentials=query.http_credentials, + viewport=query.viewport, + referer=query.referer, + rendered_hostname_only=query.rendered_hostname_only, + # force=query.force, + # recapture_interval=query.recapture_interval, + priority=query.priority, + uuid=uuid + ) + if new_uuid != 
uuid: + # somehow, between the check and queuing, the UUID isn't UNKNOWN anymore, just checking that + self.logger.warning(f'Had to change the capture UUID (duplicate). Old: {uuid} / New: {new_uuid}') + except LacusUnreachable: + self.logger.warning('Lacus still unreachable.') + break + except Exception as e: + self.logger.warning(f'Still unable to enqueue capture: {e}') + break + else: + self.lookyloo.redis.hdel(uuid, 'not_queued') + self.logger.info(f'{uuid} enqueued.') + except CaptureSettingsError as e: + self.logger.error(f'Broken settings for {uuid} made their way in the cache, removing them: {e}') + self.lookyloo.redis.zrem('to_capture', uuid) + self.lookyloo.redis.delete(uuid) + + except Exception as e: + self.logger.error(f'Unable to requeue {uuid}: {e}') def main() -> None: diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index cec80152..bfb16b6c 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -29,7 +29,7 @@ import mmh3 from defang import defang # type: ignore[import-untyped] from har2tree import CrawledTree, HostNode, URLNode -from lacuscore import (LacusCore, +from lacuscore import (LacusCore, CaptureSettingsError, CaptureStatus as CaptureStatusCore, # CaptureResponse as CaptureResponseCore) # CaptureResponseJson as CaptureResponseJsonCore, @@ -287,15 +287,25 @@ class Lookyloo(): return meta def get_capture_settings(self, capture_uuid: str, /) -> CaptureSettings | None: - if capture_settings := self.redis.hgetall(capture_uuid): - return CaptureSettings(**capture_settings) + '''Get the capture settings from the cache or the disk.''' + try: + if capture_settings := self.redis.hgetall(capture_uuid): + return CaptureSettings(**capture_settings) + except CaptureSettingsError as e: + self.logger.warning(f'Invalid capture settings for {capture_uuid}: {e}') + return None cache = self.capture_cache(capture_uuid) if not cache: return None cs_file = cache.capture_dir / 'capture_settings.json' if cs_file.exists(): - with cs_file.open('r') as 
f: - return CaptureSettings(**json.load(f)) + try: + with cs_file.open('r') as f: + return CaptureSettings(**json.load(f)) + except CaptureSettingsError as e: + self.logger.warning(f'[In file!] Invalid capture settings for {capture_uuid}: {e}') + return None + return None def categories_capture(self, capture_uuid: str, /) -> dict[str, Any]: @@ -650,7 +660,11 @@ class Lookyloo(): query.headers['dnt'] = query.dnt if authenticated: if user_config := load_user_config(user): - query = self._apply_user_config(query, user_config) + try: + query = self._apply_user_config(query, user_config) + except CaptureSettingsError as e: + self.logger.critical(f'Unable to apply user config for {user}: {e}') + raise e priority = get_priority(source, user, authenticated) if priority < -100: @@ -714,7 +728,11 @@ class Lookyloo(): if to_return['contacts']: to_return['all_emails'] |= set(to_return['contacts']) to_return['ips'] = {ip: self.uwhois.whois(ip, contact_email_only=True) for ip in set(hostnode.resolved_ips['v4']) | set(hostnode.resolved_ips['v6'])} - to_return['asns'] = {asn['asn']: self.uwhois.whois(f'AS{asn["asn"]}', contact_email_only=True) for asn in hostnode.ipasn.values()} + if hasattr(hostnode, 'ipasn'): + to_return['asns'] = {asn['asn']: self.uwhois.whois(f'AS{asn["asn"]}', contact_email_only=True) for asn in hostnode.ipasn.values()} + else: + self.logger.warning(f'No IPASN for {hostnode.name}') + to_return['asns'] = {} # try to get contact from security.txt file try: @@ -1429,7 +1447,11 @@ class Lookyloo(): error = lookyloo_capture.read(filename).decode() elif filename.endswith('capture_settings.json'): _capture_settings = json.loads(lookyloo_capture.read(filename)) - capture_settings = CaptureSettings(**_capture_settings) + try: + capture_settings = CaptureSettings(**_capture_settings) + except CaptureSettingsError as e: + unrecoverable_error = True + messages['errors'].append(f'Invalid Capture Settings: {e}') else: for to_skip in files_to_skip: if 
filename.endswith(to_skip): diff --git a/website/web/__init__.py b/website/web/__init__.py index 637ad5af..c991e3a0 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -32,7 +32,7 @@ from flask import (Flask, Response, Request, flash, jsonify, redirect, render_te from flask_bootstrap import Bootstrap5 # type: ignore[import-untyped] from flask_cors import CORS # type: ignore[import-untyped] from flask_restx import Api # type: ignore[import-untyped] -from lacuscore import CaptureStatus +from lacuscore import CaptureStatus, CaptureSettingsError from puremagic import from_string from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined] from werkzeug.security import check_password_hash @@ -283,6 +283,16 @@ def file_response(func): # type: ignore[no-untyped-def] return wrapper +@app.errorhandler(CaptureSettingsError) +def handle_pydandic_validation_exception(error: CaptureSettingsError) -> Response | str | WerkzeugResponse: + '''Flash the validation error message and redirect to the landing page''' + if error.pydantic_validation_errors: + flash(f'Unable to validate capture settings: {error.pydantic_validation_errors.errors()}') + else: + flash(str(error)) + return redirect(url_for('landing_page')) + + # ##### Methods querying the indexes ##### def _get_body_hash_investigator(body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]: diff --git a/website/web/genericapi.py b/website/web/genericapi.py index 95c6bbff..2c898ccc 100644 --- a/website/web/genericapi.py +++ b/website/web/genericapi.py @@ -17,7 +17,7 @@ from flask import request, send_file, Response from flask_restx import Namespace, Resource, fields, abort # type: ignore[import-untyped] from werkzeug.security import check_password_hash -from lacuscore import CaptureStatus as CaptureStatusCore +from lacuscore import CaptureStatus as CaptureStatusCore, CaptureSettingsError from pylacus import CaptureStatus as CaptureStatusPy from lookyloo import 
CaptureSettings, Lookyloo from lookyloo.comparator import Comparator @@ -51,6 +51,15 @@ def handle_no_HAR_file_exception(error: Any) -> tuple[dict[str, str], int]: return {'message': str(error)}, 400 +@api.errorhandler(CaptureSettingsError) # type: ignore[misc] +def handle_pydandic_validation_exception(error: CaptureSettingsError) -> tuple[dict[str, Any], int]: + '''Return the validation error message and 400 status code''' + if error.pydantic_validation_errors: + return {'message': 'Unable to validate capture settings.', + 'details': error.pydantic_validation_errors.errors()}, 400 + return {'message': str(error)}, 400 + + @api.route('/json/get_user_config') @api.doc(description='Get the configuration of the user (if any)', security='apikey') class UserConfig(Resource): # type: ignore[misc]