diff --git a/bin/async_capture.py b/bin/async_capture.py index b69ccb46..b896994d 100755 --- a/bin/async_capture.py +++ b/bin/async_capture.py @@ -59,10 +59,11 @@ class AsyncCapture(AbstractManager): if 'cookies' in to_capture: to_capture['cookies_pseudofile'] = to_capture.pop('cookies') + self.logger.info(f'Capturing {to_capture["url"]} - {uuid}') if self._capture(**to_capture): # type: ignore - self.logger.info(f'Processed {to_capture["url"]}') + self.logger.info(f'Successfully captured {to_capture["url"]} - {uuid}') else: - self.logger.warning(f'Unable to capture {to_capture["url"]}') + self.logger.warning(f'Unable to capture {to_capture["url"]} - {uuid}') lazy_cleanup.srem('ongoing', uuid) lazy_cleanup.delete(uuid) # make sure to expire the key if nothing was processed for a while (= queues empty) diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py index b66cc198..5c270c53 100644 --- a/lookyloo/helpers.py +++ b/lookyloo/helpers.py @@ -16,7 +16,6 @@ from functools import lru_cache from enum import IntEnum, unique from har2tree import CrawledTree, HostNode, URLNode -from redis import Redis import requests from requests.exceptions import HTTPError from publicsuffix2 import PublicSuffixList, fetch # type: ignore @@ -264,17 +263,6 @@ def get_useragent_for_requests(): return f'Lookyloo / {version}' -def get_capture_status(capture_uuid: str, /) -> CaptureStatus: - r = Redis(unix_socket_path=get_socket_path('cache')) - if r.zrank('to_capture', capture_uuid) is not None: - return CaptureStatus.QUEUED - elif r.hexists('lookup_dirs', capture_uuid): - return CaptureStatus.DONE - elif r.sismember('ongoing', capture_uuid): - return CaptureStatus.ONGOING - return CaptureStatus.UNKNOWN - - @lru_cache(64) def get_splash_url() -> str: if os.environ.get('SPLASH_URL_DOCKER'): diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index bd2b5261..8f9ca5da 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -31,7 +31,7 @@ from werkzeug.useragents import UserAgent from .exceptions import NoValidHarFile, MissingUUID, LookylooException, MissingCaptureDirectory from .helpers import (get_homedir, get_socket_path, get_config, get_email_template, load_pickle_tree, remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains, - try_make_file, get_captures_dir, get_splash_url) + try_make_file, get_captures_dir, get_splash_url, CaptureStatus) from .modules import VirusTotal, SaneJavaScript, PhishingInitiative, MISP, UniversalWhois, UrlScan from .capturecache import CaptureCache from .context import Context @@ -563,6 +563,15 @@ class Lookyloo(): all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True) return all_cache + def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus: + if self.redis.zrank('to_capture', capture_uuid) is not None: + return CaptureStatus.QUEUED + elif self.redis.hexists('lookup_dirs', capture_uuid): + return CaptureStatus.DONE + elif self.redis.sismember('ongoing', capture_uuid): + return CaptureStatus.ONGOING + return CaptureStatus.UNKNOWN + def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]: """Get the cache from redis.""" if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects: @@ -570,7 +579,8 @@ class Lookyloo(): try: capture_dir = self._get_capture_dir(capture_uuid) except LookylooException: - self.logger.warning(f'Unable to find {capture_uuid} (not in the cache and/or missing capture directory).') + if self.get_capture_status(capture_uuid) not in [CaptureStatus.QUEUED, CaptureStatus.ONGOING]: + self.logger.warning(f'Unable to find {capture_uuid} (not in the cache and/or missing capture directory).') return None cached = self.redis.hgetall(str(capture_dir)) diff --git a/website/web/__init__.py b/website/web/__init__.py index 0d540087..e01284f6 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -22,7 +22,7 @@ from werkzeug.security import check_password_hash from pymisp import MISPEvent, MISPServerError from lookyloo.helpers import (get_user_agents, get_config, get_taxonomies, load_cookies, - CaptureStatus, splash_status, get_capture_status) + CaptureStatus, splash_status) from lookyloo.lookyloo import Lookyloo, Indexing from lookyloo.exceptions import NoValidHarFile, MissingUUID @@ -556,7 +556,7 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None): return redirect(url_for('index')) cache = lookyloo.capture_cache(tree_uuid) if not cache: - status = get_capture_status(tree_uuid) + status = lookyloo.get_capture_status(tree_uuid) splash_up, splash_message = splash_status() if not splash_up: flash(f'The capture module is not reachable ({splash_message}).', 'error') diff --git a/website/web/genericapi.py b/website/web/genericapi.py index 8a09041e..51d4cf1e 100644 --- a/website/web/genericapi.py +++ b/website/web/genericapi.py @@ -13,7 +13,7 @@ from werkzeug.security import check_password_hash from lookyloo.lookyloo import Lookyloo from .helpers import (src_request_ip, load_user_from_request, build_users_table) -from lookyloo.helpers import splash_status, get_capture_status +from lookyloo.helpers import splash_status api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/') @@ -71,7 +71,7 @@ class SplashStatus(Resource): params={'capture_uuid': 'The UUID of the capture'}) class CaptureStatusQuery(Resource): def get(self, capture_uuid: str): - return {'status_code': get_capture_status(capture_uuid)} + return {'status_code': lookyloo.get_capture_status(capture_uuid)} @api.route('/json//hostnames')