From 9677c4d1206a599c58982d556735dec2c0e00ae4 Mon Sep 17 00:00:00 2001
From: Raphaël Vinot
Date: Tue, 1 Nov 2022 18:10:20 +0100
Subject: [PATCH] new: Support lacus unreachable by caching locally +
 initialize lacus globally for consistency.

---
 bin/async_capture.py         |  70 ++++++++------------
 bin/background_processing.py |  54 +++++++++++++---
 lookyloo/lookyloo.py         | 121 ++++++++++++++++++++++------------
 poetry.lock                  |  20 +++---
 pyproject.toml               |   4 +-
 tools/monitoring.py          |   2 +
 6 files changed, 163 insertions(+), 108 deletions(-)
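
Overview of the change: when the remote lacus instance cannot be reached,
Lookyloo.enqueue_capture no longer loses the submission. It generates a UUID
locally, stores the capture settings in redis under that UUID with a
'not_queued' flag, and the background processing script retries the enqueue on
every run. The sketch below condenses that fallback using the key names from
this patch ('to_capture', 'not_queued'); the enqueue_or_cache helper itself is
illustrative only, not part of the codebase:

    import json
    from typing import Any, Dict, Union
    from uuid import uuid4

    from redis import Redis

    def enqueue_or_cache(lacus, redis: Redis, settings: Dict[str, Any]) -> str:
        '''Try to enqueue on lacus; on failure, cache the settings locally so
        a background process can retry later.'''
        try:
            return lacus.enqueue(**settings)
        except Exception:
            uuid = str(uuid4())  # lacus never saw this capture: make the UUID locally
            settings['not_queued'] = 1  # the marker the retry loop looks for
            # Make the settings redis compatible (same rules as in lookyloo.py below)
            mapping: Dict[str, Union[bytes, float, int, str]] = {}
            for key, value in settings.items():
                if isinstance(value, bool):
                    mapping[key] = 1 if value else 0
                elif isinstance(value, (list, dict)):
                    if value:
                        mapping[key] = json.dumps(value)
                elif value is not None:
                    mapping[key] = value
            redis.zadd('to_capture', {uuid: mapping.get('priority', 0)})
            redis.hset(uuid, mapping=mapping)
            return uuid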
diff --git a/bin/async_capture.py b/bin/async_capture.py
index a9ac6369..2d5b6095 100755
--- a/bin/async_capture.py
+++ b/bin/async_capture.py
@@ -11,10 +11,10 @@ from pathlib import Path
 from typing import Dict, Optional, Union
 
 from lacuscore import LacusCore, CaptureStatus as CaptureStatusCore, CaptureResponse as CaptureResponseCore
-from pylacus import PyLacus, CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy
-from redis import Redis
+from pylacus import CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy
 
-from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
+from lookyloo.lookyloo import Lookyloo
+from lookyloo.default import AbstractManager, get_config, safe_create_dir
 from lookyloo.helpers import get_captures_dir
 from lookyloo.modules import FOX
 
@@ -30,25 +30,7 @@ class AsyncCapture(AbstractManager):
         self.script_name = 'async_capture'
         self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
         self.capture_dir: Path = get_captures_dir()
-        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'))
-
-        self.lacus: Union[PyLacus, LacusCore]
-        has_remote_lacus = False
-        if get_config('generic', 'remote_lacus'):
-            remote_lacus_config = get_config('generic', 'remote_lacus')
-            if remote_lacus_config.get('enable'):
-                self.logger.info("Remote lacus enabled, trying to set it up...")
-                remote_lacus_url = remote_lacus_config.get('url')
-                self.lacus = PyLacus(remote_lacus_url)
-                if self.lacus.is_up:
-                    has_remote_lacus = True
-                    self.logger.info(f"Remote lacus enabled to {remote_lacus_url}.")
-                else:
-                    self.logger.warning(f"Unable to setup remote lacus to {remote_lacus_url}.")
-
-        if not has_remote_lacus:
-            self.lacus = LacusCore(self.redis, get_config('generic', 'tor_proxy'),
-                                   get_config('generic', 'only_global_lookups'))
+        self.lookyloo = Lookyloo()
 
         self.captures: Dict[asyncio.Task, float] = {}
 
@@ -65,21 +47,20 @@ class AsyncCapture(AbstractManager):
         self.set_running()
         uuid: Optional[str] = None
         entries: Union[CaptureResponseCore, CaptureResponsePy]
-        if isinstance(self.lacus, LacusCore):
-            if uuid := await self.lacus.consume_queue():
-                entries = self.lacus.get_capture(uuid, decode=True)
+        if isinstance(self.lookyloo.lacus, LacusCore):
+            if uuid := await self.lookyloo.lacus.consume_queue():
+                entries = self.lookyloo.lacus.get_capture(uuid, decode=True)
                 if entries['status'] != CaptureStatusCore.DONE:
                     self.logger.warning(f'The capture {uuid} is reported as not done ({entries["status"]}) when it should be.')
-                    self.redis.zrem('to_capture', uuid)
-                    self.redis.delete(uuid)
+                    self.lookyloo.redis.zrem('to_capture', uuid)
+                    self.lookyloo.redis.delete(uuid)
         else:
             # Find a capture that is done
             try:
-                for uuid_b in self.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
-                    uuid = uuid_b.decode()
+                for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
                     if not uuid:
                         break
-                    entries = self.lacus.get_capture(uuid)
+                    entries = self.lookyloo.lacus.get_capture(uuid)
                     if entries['status'] == CaptureStatusPy.DONE:
                         log = f'Got the capture for {uuid} from Lacus'
                         if runtime := entries.get('runtime'):
@@ -92,33 +73,34 @@ class AsyncCapture(AbstractManager):
             except Exception as e:
                 self.logger.critical(f'Error when getting captures from lacus, will retry later: {e}')
                 uuid = None
+                await asyncio.sleep(10)
 
         if uuid is None:
             self.unset_running()
             return
 
-        self.redis.sadd('ongoing', uuid)
-        queue: Optional[bytes] = self.redis.getdel(f'{uuid}_mgmt')
+        self.lookyloo.redis.sadd('ongoing', uuid)
+        queue: Optional[str] = self.lookyloo.redis.getdel(f'{uuid}_mgmt')
 
-        to_capture: Dict[bytes, bytes] = self.redis.hgetall(uuid)
+        to_capture: Dict[str, str] = self.lookyloo.redis.hgetall(uuid)
 
         if get_config('generic', 'default_public'):
             # By default, the captures are on the index, unless the user marks them as unlisted
-            listing = False if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
+            listing = False if ('listing' in to_capture and to_capture['listing'].lower() in ['false', '0', '']) else True
         else:
             # By default, the captures are not on the index, unless the user marks them as listed
-            listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False
+            listing = True if ('listing' in to_capture and to_capture['listing'].lower() in ['true', '1']) else False
 
         now = datetime.now()
         dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
         safe_create_dir(dirpath)
 
-        if b'os' in to_capture or b'browser' in to_capture:
+        if 'os' in to_capture or 'browser' in to_capture:
             meta: Dict[str, str] = {}
-            if b'os' in to_capture:
-                meta['os'] = to_capture[b'os'].decode()
-            if b'browser' in to_capture:
-                meta['browser'] = to_capture[b'browser'].decode()
+            if 'os' in to_capture:
+                meta['os'] = to_capture['os']
+            if 'browser' in to_capture:
+                meta['browser'] = to_capture['browser']
             with (dirpath / 'meta').open('w') as _meta:
                 json.dump(meta, _meta)
 
@@ -131,9 +113,9 @@ class AsyncCapture(AbstractManager):
             (dirpath / 'no_index').touch()
 
         # Write parent UUID (optional)
-        if b'parent' in to_capture:
+        if 'parent' in to_capture:
             with (dirpath / 'parent').open('w') as _parent:
-                _parent.write(to_capture[b'parent'].decode())
+                _parent.write(to_capture['parent'])
 
         if 'downloaded_filename' in entries and entries['downloaded_filename']:
             with (dirpath / '0.data.filename').open('w') as _downloaded_filename:
@@ -167,9 +149,9 @@ class AsyncCapture(AbstractManager):
             with (dirpath / '0.cookies.json').open('w') as _cookies:
                 json.dump(entries['cookies'], _cookies)
 
-        lazy_cleanup = self.redis.pipeline()
+        lazy_cleanup = self.lookyloo.redis.pipeline()
         lazy_cleanup.hset('lookup_dirs', uuid, str(dirpath))
-        if queue and self.redis.zscore('queues', queue):
+        if queue and self.lookyloo.redis.zscore('queues', queue):
             lazy_cleanup.zincrby('queues', -1, queue)
         lazy_cleanup.zrem('to_capture', uuid)
         lazy_cleanup.srem('ongoing', uuid)
diff --git a/bin/background_processing.py b/bin/background_processing.py
index 369e5adf..a44c5bd7 100755
--- a/bin/background_processing.py
+++ b/bin/background_processing.py
@@ -6,9 +6,8 @@ from collections import Counter
 from datetime import date, timedelta
 from typing import Any, Dict
 
-from redis import Redis
-
-from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path, safe_create_dir
+from lookyloo.lookyloo import Lookyloo
+from lookyloo.default import AbstractManager, get_config, get_homedir, safe_create_dir
 from lookyloo.helpers import ParsedUserAgent, serialize_to_json
 
 logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
@@ -19,13 +18,15 @@ class Processing(AbstractManager):
 
     def __init__(self, loglevel: int=logging.INFO):
         super().__init__(loglevel)
-        self.script_name = 'archiver'
+        self.script_name = 'processing'
+        self.lookyloo = Lookyloo()
 
         self.use_own_ua = get_config('generic', 'use_user_agents_users')
 
     def _to_run_forever(self):
         if self.use_own_ua:
             self._build_ua_file()
+        self._retry_failed_enqueue()
 
     def _build_ua_file(self):
         '''Build a file in a format compatible with the capture page'''
@@ -34,11 +35,10 @@ class Processing(AbstractManager):
         safe_create_dir(self_generated_ua_file_path)
         self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
         if self_generated_ua_file.exists():
-            self.logger.info(f'User-agent file for {yesterday} already exists.')
+            self.logger.debug(f'User-agent file for {yesterday} already exists.')
             return
         self.logger.info(f'Generating user-agent file for {yesterday}')
-        redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
-        entries = redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
+        entries = self.lookyloo.redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
         if not entries:
             self.logger.info(f'No User-agent file for {yesterday} to generate.')
             return
@@ -67,13 +67,49 @@ class Processing(AbstractManager):
             json.dump(to_store, f, indent=2, default=serialize_to_json)
 
         # Remove the UA / IP mapping.
-        redis.delete(f'user_agents|{yesterday.isoformat()}')
+        self.lookyloo.redis.delete(f'user_agents|{yesterday.isoformat()}')
         self.logger.info(f'User-agent file for {yesterday} generated.')
 
+    def _retry_failed_enqueue(self):
+        '''If enqueuing to lacus failed, the capture settings were cached in the 'to_capture' key under a locally generated UUID and flagged as 'not_queued'; retry enqueuing them.'''
+        for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
+            if self.lookyloo.redis.hexists(uuid, 'not_queued'):
+                self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')
+                # This capture couldn't be queued and we created the uuid locally
+                query = self.lookyloo.redis.hgetall(uuid)
+                try:
+                    self.lookyloo.lacus.enqueue(
+                        url=query.get('url', None),
+                        document_name=query.get('document_name', None),
+                        document=query.get('document', None),
+                        # depth=query.get('depth', 0),
+                        browser=query.get('browser', None),
+                        device_name=query.get('device_name', None),
+                        user_agent=query.get('user_agent', None),
+                        proxy=query.get('proxy', None),
+                        general_timeout_in_sec=query.get('general_timeout_in_sec', None),
+                        cookies=query.get('cookies', None),
+                        headers=query.get('headers', None),
+                        http_credentials=query.get('http_credentials', None),
+                        viewport=query.get('viewport', None),
+                        referer=query.get('referer', None),
+                        rendered_hostname_only=query.get('rendered_hostname_only', True),
+                        # force=query.get('force', False),
+                        # recapture_interval=query.get('recapture_interval', 300),
+                        priority=query.get('priority', None),
+                        uuid=uuid
+                    )
+                except Exception as e:
+                    self.logger.warning(f'Still unable to enqueue capture: {e}')
+                    break
+                else:
+                    self.lookyloo.redis.hdel(uuid, 'not_queued')
+                    self.logger.info(f'{uuid} enqueued.')
+
 
 def main():
     p = Processing()
-    p.run(sleep_in_sec=3600 * 24)
+    p.run(sleep_in_sec=30)
 
 
 if __name__ == '__main__':
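
The consumer side of the 'not_queued' marker is _retry_failed_enqueue above.
Condensed to its control flow (a sketch reusing the patch's key names; the
free-standing function and the single url argument are illustrative, the real
code re-submits every cached setting):

    def retry_failed_enqueues(lookyloo) -> None:
        # 'to_capture' is a sorted set scored by priority: highest first.
        for uuid in lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
            if not lookyloo.redis.hexists(uuid, 'not_queued'):
                continue  # this capture already reached lacus
            settings = lookyloo.redis.hgetall(uuid)
            try:
                lookyloo.lacus.enqueue(uuid=uuid, url=settings.get('url'))
            except Exception:
                break  # lacus is most likely still down, retry on the next run
            else:
                lookyloo.redis.hdel(uuid, 'not_queued')

Note that the daemon's sleep interval drops from one day to 30 seconds so the
retries happen promptly once lacus is reachable again.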
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 58ae3755..1a3545ad 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -9,10 +9,12 @@ import smtplib
 from collections import defaultdict
 from datetime import date, datetime
 from email.message import EmailMessage
+from functools import cached_property
 from io import BytesIO
 from pathlib import Path
 from typing import (Any, Dict, Iterable, List, MutableMapping, Optional,
                     Set, Tuple, Union)
+from uuid import uuid4
 from zipfile import ZipFile
 
 from defang import defang  # type: ignore
@@ -108,27 +110,36 @@ class Lookyloo():
         self._captures_index = CapturesIndex(self.redis, self.context)
         self.logger.info('Index initialized.')
 
+        # init lacus
+        self.lacus
+
+    @property
+    def redis(self):
+        return Redis(connection_pool=self.redis_pool)
+
+    @cached_property
+    def lacus(self):
         has_remote_lacus = False
-        self.lacus: Union[PyLacus, LacusCore]
+        self._lacus: Union[PyLacus, LacusCore]
         if get_config('generic', 'remote_lacus'):
             remote_lacus_config = get_config('generic', 'remote_lacus')
             if remote_lacus_config.get('enable'):
                 self.logger.info("Remote lacus enabled, trying to set it up...")
                 remote_lacus_url = remote_lacus_config.get('url')
-                self.lacus = PyLacus(remote_lacus_url)
-                if self.lacus.is_up:
+                self._lacus = PyLacus(remote_lacus_url)
+                if self._lacus.is_up:
                     has_remote_lacus = True
                     self.logger.info(f"Remote lacus enabled to {remote_lacus_url}.")
                 else:
                     self.logger.warning(f"Unable to setup remote lacus to {remote_lacus_url}.")
+                    raise LookylooException('Remote lacus is enabled but unreachable.')
 
         if not has_remote_lacus:
-            self.lacus = LacusCore(self.redis, get_config('generic', 'tor_proxy'),
-                                   get_config('generic', 'only_global_lookups'))
-
-    @property
-    def redis(self):
-        return Redis(connection_pool=self.redis_pool)
+            # We need a redis connector that doesn't decode.
+            redis: Redis = Redis(unix_socket_path=get_socket_path('cache'))
+            self._lacus = LacusCore(redis, get_config('generic', 'tor_proxy'),
+                                    get_config('generic', 'only_global_lookups'))
+        return self._lacus
 
     def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
                     legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
@@ -375,7 +386,15 @@ class Lookyloo():
         elif self.redis.sismember('ongoing', capture_uuid):
             # Post-processing on lookyloo's side
             return CaptureStatusCore.ONGOING
-        lacus_status = self.lacus.get_capture_status(capture_uuid)
+        try:
+            lacus_status = self.lacus.get_capture_status(capture_uuid)
+        except Exception as e:
+            self.logger.warning(f'Unable to get the status for {capture_uuid} from lacus: {e}')
+            if self.redis.zscore('to_capture', capture_uuid) is not None:
+                return CaptureStatusCore.QUEUED
+            else:
+                return CaptureStatusCore.UNKNOWN
+
         if (lacus_status == CaptureStatusCore.UNKNOWN
                 and self.redis.zscore('to_capture', capture_uuid) is not None):
             # If we query before lacus picks the capture up, we would otherwise tell the user that the UUID doesn't exist.
@@ -473,41 +492,57 @@ class Lookyloo():
 
         query = self._prepare_lacus_query(query)
 
-        priority = get_priority(source, user, authenticated)
-        perma_uuid = self.lacus.enqueue(
-            url=query.pop('url', None),
-            document_name=query.pop('document_name', None),
-            document=query.pop('document', None),
-            # depth=query.pop('depth', 0),
-            browser=query.pop('browser', None),
-            device_name=query.pop('device_name', None),
-            user_agent=query.pop('user_agent', None),
-            proxy=query.pop('proxy', None),
-            general_timeout_in_sec=query.pop('general_timeout_in_sec', None),
-            cookies=query.pop('cookies', None),
-            headers=query.pop('headers', None),
-            http_credentials=query.pop('http_credentials', None),
-            viewport=query.pop('viewport', None),
-            referer=query.pop('referer', None),
-            rendered_hostname_only=query.pop('rendered_hostname_only', True),
-            # force=query.pop('force', False),
-            # recapture_interval=query.pop('recapture_interval', 300),
-            priority=priority
-        )
+        query['priority'] = get_priority(source, user, authenticated)
+        if query['priority'] < -10:
+            # Someone is probably abusing the system with useless URLs, remove them from the index
+            query['listing'] = 0
+        try:
+            perma_uuid = self.lacus.enqueue(
+                url=query.get('url', None),
+                document_name=query.get('document_name', None),
+                document=query.get('document', None),
+                # depth=query.get('depth', 0),
+                browser=query.get('browser', None),
+                device_name=query.get('device_name', None),
+                user_agent=query.get('user_agent', None),
+                proxy=query.get('proxy', None),
+                general_timeout_in_sec=query.get('general_timeout_in_sec', None),
+                cookies=query.get('cookies', None),
+                headers=query.get('headers', None),
+                http_credentials=query.get('http_credentials', None),
+                viewport=query.get('viewport', None),
+                referer=query.get('referer', None),
+                rendered_hostname_only=query.get('rendered_hostname_only', True),
+                # force=query.get('force', False),
+                # recapture_interval=query.get('recapture_interval', 300),
+                priority=query.get('priority', 0)
+            )
+        except Exception as e:
+            self.logger.critical(f'Unable to enqueue capture: {e}')
+            perma_uuid = str(uuid4())
+            query['not_queued'] = 1
+        finally:
+            if (not self.redis.hexists('lookup_dirs', perma_uuid)  # already captured
+                    and self.redis.zscore('to_capture', perma_uuid) is None):  # capture ongoing
 
-        if (not self.redis.hexists('lookup_dirs', perma_uuid)  # already captured
-                and self.redis.zscore('to_capture', perma_uuid) is None):  # capture ongoing
-            if priority < -10:
-                # Someone is probably abusing the system with useless URLs, remove them from the index
-                query['listing'] = 0
+                # Make the settings redis compatible
+                mapping_capture: Dict[str, Union[bytes, float, int, str]] = {}
+                for key, value in query.items():
+                    if isinstance(value, bool):
+                        mapping_capture[key] = 1 if value else 0
+                    elif isinstance(value, (list, dict)):
+                        if value:
+                            mapping_capture[key] = json.dumps(value)
+                    elif value is not None:
+                        mapping_capture[key] = value
 
-            p = self.redis.pipeline()
-            p.zadd('to_capture', {perma_uuid: priority})
-            if query:
-                p.hset(perma_uuid, mapping=query)  # This will add the remaining entries that are lookyloo specific
-            p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
-            p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
-            p.execute()
+                p = self.redis.pipeline()
+                p.zadd('to_capture', {perma_uuid: query['priority']})
+                p.hset(perma_uuid, mapping=mapping_capture)
+                p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
+                p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
+                p.execute()
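
A note on the lookyloo.py changes: the lacus connector is now built through
functools.cached_property, and the bare `self.lacus` statement at the end of
__init__ forces the first evaluation, so a remote lacus that is enabled but
unreachable raises at startup instead of on the first capture. A minimal,
self-contained sketch of that mechanism (the Service/connector names are
illustrative):

    from functools import cached_property

    class Service:
        def __init__(self) -> None:
            # Touching the property makes initialization eager: the getter
            # runs once and its result is cached on the instance.
            self.connector

        @cached_property
        def connector(self) -> dict:
            print('connecting once')
            return {'up': True}

    s = Service()    # prints 'connecting once'
    _ = s.connector  # served from the cache, prints nothing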
 
         return perma_uuid
 
     def send_mail(self, capture_uuid: str, /, email: str='', comment: str='') -> None:
diff --git a/poetry.lock b/poetry.lock
index 51c42861..5ed96f9d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -571,7 +571,7 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-
 
 [[package]]
 name = "lacuscore"
-version = "1.1.2"
+version = "1.1.3"
 description = "Core of Lacus, usable as a module"
 category = "main"
 optional = false
@@ -901,7 +901,7 @@ docs = ["Sphinx (>=5.1.1,<6.0.0)"]
 
 [[package]]
 name = "pylacus"
-version = "1.1.0"
+version = "1.1.1"
 description = "Python CLI and module for lacus"
 category = "main"
 optional = false
@@ -1045,7 +1045,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 
 [[package]]
 name = "pytz"
-version = "2022.5"
+version = "2022.6"
 description = "World timezone definitions, modern and historical"
 category = "main"
 optional = false
@@ -1476,7 +1476,7 @@ misp = ["python-magic", "pydeep2"]
 
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.8,<3.12"
-content-hash = "f274187b6e2cc2fd68d671405f24ddbadf6a362e68cea4eb5093fdce7ae9c55e"
+content-hash = "5adbb7f2a6e81f21a199fc3f9bd1e4594dcc6a1dece1282f892edc5e95c4ba06"
 
 [metadata.files]
 aiohttp = [
@@ -1931,8 +1931,8 @@ jsonschema = [
     {file = "jsonschema-4.16.0.tar.gz", hash = "sha256:165059f076eff6971bae5b742fc029a7b4ef3f9bcf04c14e4776a7605de14b23"},
 ]
 lacuscore = [
-    {file = "lacuscore-1.1.2-py3-none-any.whl", hash = "sha256:876d3ccb743bb4d43421d1670762af2f54c6b82dfdec9b5a4c37109dbabd02c6"},
-    {file = "lacuscore-1.1.2.tar.gz", hash = "sha256:ed83c8f4cb31e24ec0e39ce85fcd9dd675c0ae96bf9aaededf7c21469be6b1ad"},
+    {file = "lacuscore-1.1.3-py3-none-any.whl", hash = "sha256:8e9cd36083723423b6ecc2bac2da4cd97a35a92e8b3bab8907d583323e732a97"},
+    {file = "lacuscore-1.1.3.tar.gz", hash = "sha256:e02ea9fa594ce32ee5094917c08d8e20cdc0b2bf5dd939cbacf6b02103c4028e"},
 ]
 lief = [
     {file = "lief-0.12.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:cdadaab4b9ec756e1d1f0324acd6e280ae849d251e66f836da455df592deaf9e"},
@@ -2358,8 +2358,8 @@ pyhashlookup = [
     {file = "pyhashlookup-1.2.1.tar.gz", hash = "sha256:eb514cc1a5559a013a8882e101849fa52a37641f2a7d9dc21c0d266b37607aa5"},
 ]
 pylacus = [
-    {file = "pylacus-1.1.0-py3-none-any.whl", hash = "sha256:880ce273fe35a554a35d6812e899e69b2f360fa5041822b136fbaf8f336a44d3"},
-    {file = "pylacus-1.1.0.tar.gz", hash = "sha256:90ca603bf58d19197f4776fffc58b9b98f34d12916457a026ff2cddbaa42fb6f"},
+    {file = "pylacus-1.1.1-py3-none-any.whl", hash = "sha256:61eab358c20b0fbb1915a91f82d5dc8ee3b8f22de7e23dfe2206af937e4d3728"},
+    {file = "pylacus-1.1.1.tar.gz", hash = "sha256:4eca75827ba977fb6dbeef587cf2a08324e4e4cdbc30a8762c695099be27a9dc"},
 ]
 pylookyloo = [
     {file = "pylookyloo-1.16.0-py3-none-any.whl", hash = "sha256:2c39e26eae61144e6bb986fbcb58604e9804b4d6b2fe1ff844d8b429db2628d2"},
@@ -2422,8 +2422,8 @@ python-magic = [
     {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"},
 ]
 pytz = [
-    {file = "pytz-2022.5-py2.py3-none-any.whl", hash = "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22"},
-    {file = "pytz-2022.5.tar.gz", hash = "sha256:c4d88f472f54d615e9cd582a5004d1e5f624854a6a27a6211591c251f22a6914"},
+    {file = "pytz-2022.6-py2.py3-none-any.whl", hash = "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427"},
+    {file = "pytz-2022.6.tar.gz", hash = "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2"},
 ]
 pytz-deprecation-shim = [
     {file = "pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl", hash = "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6"},
diff --git a/pyproject.toml b/pyproject.toml
index 9f79a00f..ce84765b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,8 +67,8 @@ passivetotal = "^2.5.9"
 werkzeug = "2.1.2"
 filetype = "^1.1.0"
 pypandora = "^1.2.0"
-lacuscore = "^1.1.2"
-pylacus = "^1.1.0"
+lacuscore = "^1.1.3"
+pylacus = "^1.1.1"
 
 [tool.poetry.extras]
 misp = ['python-magic', 'pydeep2']
diff --git a/tools/monitoring.py b/tools/monitoring.py
index 3824ec29..33a74bb5 100755
--- a/tools/monitoring.py
+++ b/tools/monitoring.py
@@ -67,6 +67,8 @@ class Monitoring():
         to_return = []
         for uuid, rank in captures_uuid:
             capture_params = self.redis_cache.hgetall(uuid)
+            if 'document' in capture_params:
+                capture_params.pop('document')
             if capture_params:
                 to_return.append((uuid, rank, capture_params))