From 3c8fcb37007701753955abd14072174fb4b18684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Tue, 20 Sep 2022 14:49:58 +0200 Subject: [PATCH] chg: Bump lacuscore --- bin/async_capture.py | 63 +++++++++++++++++++---------------------- lookyloo/helpers.py | 9 ------ lookyloo/lookyloo.py | 15 +++++----- poetry.lock | 39 ++++++++++++++++++------- pyproject.toml | 5 ++-- website/web/__init__.py | 4 +-- 6 files changed, 70 insertions(+), 65 deletions(-) diff --git a/bin/async_capture.py b/bin/async_capture.py index c654d3e..5202d83 100755 --- a/bin/async_capture.py +++ b/bin/async_capture.py @@ -6,14 +6,13 @@ import logging from datetime import datetime from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, Optional, Set -from lacuscore import LacusCore -from redis.asyncio import Redis -from redis import Redis as RedisSync +from lacuscore import LacusCore, CaptureStatus +from redis import Redis from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir -from lookyloo.helpers import get_captures_dir, CaptureStatus +from lookyloo.helpers import get_captures_dir from lookyloo.modules import FOX @@ -28,8 +27,9 @@ class AsyncCapture(AbstractManager): self.script_name = 'async_capture' self.only_global_lookups: bool = get_config('generic', 'only_global_lookups') self.capture_dir: Path = get_captures_dir() - self.redis_sync: RedisSync = RedisSync(unix_socket_path=get_socket_path('cache')) - self.lacus = LacusCore(self.redis_sync) + self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache')) + self.lacus = LacusCore(self.redis) + self.captures: Set[asyncio.Task] = set() self.fox = FOX(get_config('modules', 'FOX')) if not self.fox.available: @@ -41,15 +41,13 @@ class AsyncCapture(AbstractManager): async def process_capture_queue(self) -> None: '''Process a query from the capture queue''' - value: List[Tuple[bytes, float]] = await self.redis.zpopmax('to_capture') - if not value or not value[0]: - # The queue was consumed by an other process. + uuid = await self.lacus.consume_queue() + if not uuid: return - uuid: str = value[0][0].decode() - queue: Optional[bytes] = await self.redis.getdel(f'{uuid}_mgmt') - await self.redis.sadd('ongoing', uuid) + self.redis.sadd('ongoing', uuid) + queue: Optional[bytes] = self.redis.getdel(f'{uuid}_mgmt') - to_capture: Dict[bytes, bytes] = await self.redis.hgetall(uuid) + to_capture: Dict[bytes, bytes] = self.redis.hgetall(uuid) if get_config('generic', 'default_public'): # By default, the captures are on the index, unless the user mark them as un-listed @@ -58,19 +56,17 @@ class AsyncCapture(AbstractManager): # By default, the captures are not on the index, unless the user mark them as listed listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False - status, result = await self.lacus.capture(uuid) - while True: entries = self.lacus.get_capture(uuid, decode=True) - if entries['status'] == CaptureStatus.DONE.value: + if entries['status'] == CaptureStatus.DONE: break - elif entries['status'] == CaptureStatus.UNKNOWN.value: + elif entries['status'] == CaptureStatus.UNKNOWN: self.logger.warning(f'Unable to find {uuid}.') break - elif entries['status'] == CaptureStatus.QUEUED.value: + elif entries['status'] == CaptureStatus.QUEUED: self.logger.info(f'{uuid} is in the queue.') await asyncio.sleep(5) - elif entries['status'] == CaptureStatus.ONGOING.value: + elif entries['status'] == CaptureStatus.ONGOING: self.logger.info(f'{uuid} is ongoing.') await asyncio.sleep(5) else: @@ -135,23 +131,22 @@ class AsyncCapture(AbstractManager): with (dirpath / '0.cookies.json').open('w') as _cookies: json.dump(entries['cookies'], _cookies) - async with self.redis.pipeline() as lazy_cleanup: - await lazy_cleanup.hset('lookup_dirs', uuid, str(dirpath)) - if queue and await self.redis.zscore('queues', queue): - await lazy_cleanup.zincrby('queues', -1, queue) - await lazy_cleanup.srem('ongoing', uuid) - await lazy_cleanup.delete(uuid) + with self.redis.pipeline() as lazy_cleanup: + lazy_cleanup.hset('lookup_dirs', uuid, str(dirpath)) + if queue and self.redis.zscore('queues', queue): + lazy_cleanup.zincrby('queues', -1, queue) + lazy_cleanup.srem('ongoing', uuid) + lazy_cleanup.delete(uuid) # make sure to expire the key if nothing was processed for a while (= queues empty) - await lazy_cleanup.expire('queues', 600) - await lazy_cleanup.execute() + lazy_cleanup.expire('queues', 600) + lazy_cleanup.execute() async def _to_run_forever_async(self): - self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache')) - while await self.redis.exists('to_capture'): - await self.process_capture_queue() - if self.shutdown_requested(): - break - await self.redis.close() + capture = asyncio.create_task(self.process_capture_queue()) + capture.add_done_callback(self.captures.discard) + self.captures.add(capture) + while len(self.captures) >= get_config('generic', 'async_capture_processes'): + await asyncio.sleep(1) def main(): diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py index a670af4..b5a8bff 100644 --- a/lookyloo/helpers.py +++ b/lookyloo/helpers.py @@ -4,7 +4,6 @@ import json import logging from datetime import datetime, timedelta -from enum import IntEnum, unique from functools import lru_cache from importlib.metadata import version from io import BufferedIOBase @@ -26,14 +25,6 @@ from .default import get_homedir, safe_create_dir, get_config logger = logging.getLogger('Lookyloo - Helpers') -@unique -class CaptureStatus(IntEnum): - UNKNOWN = -1 - QUEUED = 0 - DONE = 1 - ONGOING = 2 - - # This method is used in json.dump or json.dumps calls as the default parameter: # json.dumps(..., default=dump_to_json) def serialize_to_json(obj: Union[Set]) -> Union[List]: diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 0f31438..a6e0b07 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -19,7 +19,7 @@ from zipfile import ZipFile from defang import defang # type: ignore from har2tree import CrawledTree, HostNode, URLNode -from lacuscore import LacusCore +from lacuscore import LacusCore, CaptureStatus from PIL import Image, UnidentifiedImageError from playwrightcapture import get_devices from pymisp import MISPAttribute, MISPEvent, MISPObject @@ -31,7 +31,7 @@ from .context import Context from .default import LookylooException, get_homedir, get_config, get_socket_path from .exceptions import (MissingCaptureDirectory, MissingUUID, TreeNeedsRebuild, NoValidHarFile) -from .helpers import (CaptureStatus, get_captures_dir, get_email_template, +from .helpers import (get_captures_dir, get_email_template, get_resources_hashes, get_taxonomies, uniq_domains, ParsedUserAgent, load_cookies, UserAgents) from .indexing import Indexing @@ -101,7 +101,8 @@ class Lookyloo(): self._captures_index = CapturesIndex(self.redis, self.context) self.logger.info('Index initialized.') - self.lacus = LacusCore(self.redis, get_config('generic', 'tor_proxy')) + self.lacus = LacusCore(self.redis, get_config('generic', 'tor_proxy'), + get_config('generic', 'only_global_lookups')) @property def redis(self): @@ -347,13 +348,12 @@ class Lookyloo(): def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus: '''Returns the status (queued, ongoing, done, or UUID unknown)''' - if self.redis.zrank('to_capture', capture_uuid) is not None: - return CaptureStatus.QUEUED - elif self.redis.hexists('lookup_dirs', capture_uuid): + if self.redis.hexists('lookup_dirs', capture_uuid): return CaptureStatus.DONE elif self.redis.sismember('ongoing', capture_uuid): + # Post-processing on lookyloo's side return CaptureStatus.ONGOING - return CaptureStatus.UNKNOWN + return self.lacus.get_capture_status(capture_uuid) def try_error_status(self, capture_uuid: str, /) -> Optional[str]: '''If it is not possible to do the capture, we store the error for a short amount of time''' @@ -461,7 +461,6 @@ class Lookyloo(): # Someone is probably abusing the system with useless URLs, remove them from the index query['listing'] = 0 p.hset(perma_uuid, mapping=query) # This will add the remaining entries that are lookyloo specific - p.zadd('to_capture', {perma_uuid: priority}) p.zincrby('queues', 1, f'{source}|{authenticated}|{user}') p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}') p.execute() diff --git a/poetry.lock b/poetry.lock index 49e26d6..fba1c81 100644 --- a/poetry.lock +++ b/poetry.lock @@ -140,7 +140,7 @@ python-versions = "*" [[package]] name = "certifi" -version = "2022.6.15.2" +version = "2022.9.14" description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false @@ -436,7 +436,7 @@ python-versions = ">=3.6" [[package]] name = "idna" -version = "3.3" +version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" category = "main" optional = false @@ -563,6 +563,21 @@ pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] +[[package]] +name = "lacuscore" +version = "0.2.0" +description = "Core of Lacus, usable as a module" +category = "main" +optional = false +python-versions = ">=3.8,<4.0" + +[package.dependencies] +playwrightcapture = ">=1.15.2,<2.0.0" +requests = ">=2.28.1,<3.0.0" + +[package.extras] +docs = ["Sphinx (>=5.1.1,<6.0.0)"] + [[package]] name = "lief" version = "0.12.1" @@ -741,7 +756,7 @@ websockets = "10.1" [[package]] name = "playwrightcapture" -version = "1.15.1" +version = "1.15.2" description = "A simple library to capture websites using playwright" category = "main" optional = false @@ -1437,7 +1452,7 @@ misp = ["python-magic", "pydeep2"] [metadata] lock-version = "1.1" python-versions = ">=3.8,<3.11" -content-hash = "46db3fd177fb45d82947eb86e58fd45d5c7396b712852b76e22d0ad9901abc3a" +content-hash = "90f66d42b27d094218cd3400cde9ca71c9b21ab0107a63b9fe3cffb4758ba47e" [metadata.files] aiohttp = [ @@ -1600,8 +1615,8 @@ cchardet = [ {file = "cchardet-2.1.7.tar.gz", hash = "sha256:c428b6336545053c2589f6caf24ea32276c6664cb86db817e03a94c60afa0eaf"}, ] certifi = [ - {file = "certifi-2022.6.15.2-py3-none-any.whl", hash = "sha256:0aa1a42fbd57645fabeb6290a7687c21755b0344ecaeaa05f4e9f6207ae2e9a8"}, - {file = "certifi-2022.6.15.2.tar.gz", hash = "sha256:aa08c101214127b9b0472ca6338315113c9487d45376fd3e669201b477c71003"}, + {file = "certifi-2022.9.14-py3-none-any.whl", hash = "sha256:e232343de1ab72c2aa521b625c80f699e356830fd0e2c620b465b304b17b0516"}, + {file = "certifi-2022.9.14.tar.gz", hash = "sha256:36973885b9542e6bd01dea287b2b4b3b21236307c56324fcc3f1160f2d655ed5"}, ] chardet = [ {file = "chardet-5.0.0-py3-none-any.whl", hash = "sha256:d3e64f022d254183001eccc5db4040520c0f23b1a3f33d6413e099eb7f126557"}, @@ -1846,8 +1861,8 @@ hiredis = [ {file = "hiredis-2.0.0.tar.gz", hash = "sha256:81d6d8e39695f2c37954d1011c0480ef7cf444d4e3ae24bc5e89ee5de360139a"}, ] idna = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, ] importlib-metadata = [ {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"}, @@ -1877,6 +1892,10 @@ jsonschema = [ {file = "jsonschema-4.16.0-py3-none-any.whl", hash = "sha256:9e74b8f9738d6a946d70705dc692b74b5429cd0960d58e79ffecfc43b2221eb9"}, {file = "jsonschema-4.16.0.tar.gz", hash = "sha256:165059f076eff6971bae5b742fc029a7b4ef3f9bcf04c14e4776a7605de14b23"}, ] +lacuscore = [ + {file = "lacuscore-0.2.0-py3-none-any.whl", hash = "sha256:3ab0bb52f82a834dc24f9fbeefd39b9dd7694953f73b1e0621e9e876ea827a4c"}, + {file = "lacuscore-0.2.0.tar.gz", hash = "sha256:9b7f54b4ce9deba3c8b6f7566e523de1523ba560095a0567da0ffe45baa4417b"}, +] lief = [ {file = "lief-0.12.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:4fbbc9d520de87ac22210c62d22a9b088e5460f9a028741311e6f68ef8877ddd"}, {file = "lief-0.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:443e4494df448ea1a021976258c7a6aca27d81b0612783fa3a84fab196fb9fcb"}, @@ -2235,8 +2254,8 @@ playwright = [ {file = "playwright-1.25.2-py3-none-win_amd64.whl", hash = "sha256:68ae739f82b78717123eb9d1b28b4619f0b368b88ef73c633681e267680697cd"}, ] playwrightcapture = [ - {file = "PlaywrightCapture-1.15.1.tar.gz", hash = "sha256:af8efda02e0cf7df32dd4d5d3b72bd04fae0a0e22521195989205fe40f9dfb59"}, - {file = "playwrightcapture-1.15.1-py3-none-any.whl", hash = "sha256:7ea84dc4590ad2bd0d26dc2e6019ca6873c676f9b901eb13dbfef69c916e5e5c"}, + {file = "PlaywrightCapture-1.15.2.tar.gz", hash = "sha256:a8a00dd779b7bf0dee18fbe3c19314de3c7dd2387a42f26c0784474b8b2e485f"}, + {file = "playwrightcapture-1.15.2-py3-none-any.whl", hash = "sha256:297aaf265a2646bf9e58632f2322dd5b89bd1874491f1dd0f275eaebe34ebc11"}, ] prompt-toolkit = [ {file = "prompt_toolkit-3.0.31-py3-none-any.whl", hash = "sha256:9696f386133df0fc8ca5af4895afe5d78f5fcfe5258111c2a79a1c3e41ffa96d"}, diff --git a/pyproject.toml b/pyproject.toml index 41d93f6..7113cf4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,11 +63,12 @@ lief = "^0.12.1" ua-parser = "^0.16.1" Flask-Login = "^0.6.2" har2tree = "^1.15.4" -playwrightcapture = "^1.15.1" +playwrightcapture = "^1.15.2" passivetotal = "^2.5.9" werkzeug = "2.1.2" filetype = "^1.1.0" pypandora = "^1.1.2" +lacuscore = "^0.2.0" [tool.poetry.extras] misp = ['python-magic', 'pydeep2'] @@ -76,7 +77,7 @@ misp = ['python-magic', 'pydeep2'] mypy = "^0.971" ipython = "^8.5.0" types-redis = "^4.3.20" -types-requests = "^2.28.9" +types-requests = "^2.28.10" types-Flask = "^1.1.6" types-pkg-resources = "^0.1.3" types-Deprecated = "^1.2.9" diff --git a/website/web/__init__.py b/website/web/__init__.py index 65d4307..8195c5e 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -21,13 +21,13 @@ from flask import (Flask, Response, flash, jsonify, redirect, render_template, from flask_bootstrap import Bootstrap5 # type: ignore from flask_cors import CORS # type: ignore from flask_restx import Api # type: ignore +from lacuscore import CaptureStatus from pymisp import MISPEvent, MISPServerError from werkzeug.security import check_password_hash from lookyloo.default import get_config from lookyloo.exceptions import MissingUUID, NoValidHarFile -from lookyloo.helpers import (CaptureStatus, get_taxonomies, - UserAgents, load_cookies) +from lookyloo.helpers import get_taxonomies, UserAgents, load_cookies from lookyloo.lookyloo import Indexing, Lookyloo from .genericapi import api as generic_api