new: Support an unreachable lacus by caching captures locally

Also initialize lacus globally for consistency.
pull/545/head
Raphaël Vinot 2022-11-01 18:10:20 +01:00
parent dd0426537a
commit 9677c4d120
6 changed files with 163 additions and 108 deletions

View File

@ -11,10 +11,10 @@ from pathlib import Path
from typing import Dict, Optional, Union
from lacuscore import LacusCore, CaptureStatus as CaptureStatusCore, CaptureResponse as CaptureResponseCore
from pylacus import PyLacus, CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy
from redis import Redis
from pylacus import CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy
from lookyloo.default import AbstractManager, get_config, get_socket_path, safe_create_dir
from lookyloo.lookyloo import Lookyloo
from lookyloo.default import AbstractManager, get_config, safe_create_dir
from lookyloo.helpers import get_captures_dir
from lookyloo.modules import FOX
@ -30,25 +30,7 @@ class AsyncCapture(AbstractManager):
self.script_name = 'async_capture'
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
self.capture_dir: Path = get_captures_dir()
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'))
self.lacus: Union[PyLacus, LacusCore]
has_remote_lacus = False
if get_config('generic', 'remote_lacus'):
remote_lacus_config = get_config('generic', 'remote_lacus')
if remote_lacus_config.get('enable'):
self.logger.info("Remote lacus enabled, trying to set it up...")
remote_lacus_url = remote_lacus_config.get('url')
self.lacus = PyLacus(remote_lacus_url)
if self.lacus.is_up:
has_remote_lacus = True
self.logger.info(f"Remote lacus enabled to {remote_lacus_url}.")
else:
self.logger.warning(f"Unable to setup remote lacus to {remote_lacus_url}.")
if not has_remote_lacus:
self.lacus = LacusCore(self.redis, get_config('generic', 'tor_proxy'),
get_config('generic', 'only_global_lookups'))
self.lookyloo = Lookyloo()
self.captures: Dict[asyncio.Task, float] = {}
@ -65,21 +47,20 @@ class AsyncCapture(AbstractManager):
self.set_running()
uuid: Optional[str] = None
entries: Union[CaptureResponseCore, CaptureResponsePy]
if isinstance(self.lacus, LacusCore):
if uuid := await self.lacus.consume_queue():
entries = self.lacus.get_capture(uuid, decode=True)
if isinstance(self.lookyloo.lacus, LacusCore):
if uuid := await self.lookyloo.lacus.consume_queue():
entries = self.lookyloo.lacus.get_capture(uuid, decode=True)
if entries['status'] != CaptureStatusCore.DONE:
self.logger.warning(f'The capture {uuid} is reported as not done ({entries["status"]}) when it should.')
self.redis.zrem('to_capture', uuid)
self.redis.delete(uuid)
self.lookyloo.redis.zrem('to_capture', uuid)
self.lookyloo.redis.delete(uuid)
else:
# Find a capture that is done
try:
for uuid_b in self.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
uuid = uuid_b.decode()
for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
if not uuid:
break
entries = self.lacus.get_capture(uuid)
entries = self.lookyloo.lacus.get_capture(uuid)
if entries['status'] == CaptureStatusPy.DONE:
log = f'Got the capture for {uuid} from Lacus'
if runtime := entries.get('runtime'):
@ -92,33 +73,34 @@ class AsyncCapture(AbstractManager):
except Exception as e:
self.logger.critical(f'Error when getting captures from lacus, will retry later: {e}')
uuid = None
await asyncio.sleep(10)
if uuid is None:
self.unset_running()
return
self.redis.sadd('ongoing', uuid)
queue: Optional[bytes] = self.redis.getdel(f'{uuid}_mgmt')
self.lookyloo.redis.sadd('ongoing', uuid)
queue: Optional[str] = self.lookyloo.redis.getdel(f'{uuid}_mgmt')
to_capture: Dict[bytes, bytes] = self.redis.hgetall(uuid)
to_capture: Dict[str, str] = self.lookyloo.redis.hgetall(uuid)
if get_config('generic', 'default_public'):
# By default, the captures are on the index, unless the user marks them as unlisted
listing = False if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'false', b'0', b'']) else True
listing = False if ('listing' in to_capture and to_capture['listing'].lower() in ['false', '0', '']) else True
else:
# By default, the captures are not on the index, unless the user marks them as listed
listing = True if (b'listing' in to_capture and to_capture[b'listing'].lower() in [b'true', b'1']) else False
listing = True if ('listing' in to_capture and to_capture['listing'].lower() in ['true', '1']) else False
now = datetime.now()
dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
safe_create_dir(dirpath)
if b'os' in to_capture or b'browser' in to_capture:
if 'os' in to_capture or 'browser' in to_capture:
meta: Dict[str, str] = {}
if b'os' in to_capture:
meta['os'] = to_capture[b'os'].decode()
if b'browser' in to_capture:
meta['browser'] = to_capture[b'browser'].decode()
if 'os' in to_capture:
meta['os'] = to_capture['os']
if 'browser' in to_capture:
meta['browser'] = to_capture['browser']
with (dirpath / 'meta').open('w') as _meta:
json.dump(meta, _meta)
@ -131,9 +113,9 @@ class AsyncCapture(AbstractManager):
(dirpath / 'no_index').touch()
# Write parent UUID (optional)
if b'parent' in to_capture:
if 'parent' in to_capture:
with (dirpath / 'parent').open('w') as _parent:
_parent.write(to_capture[b'parent'].decode())
_parent.write(to_capture['parent'])
if 'downloaded_filename' in entries and entries['downloaded_filename']:
with (dirpath / '0.data.filename').open('w') as _downloaded_filename:
@ -167,9 +149,9 @@ class AsyncCapture(AbstractManager):
with (dirpath / '0.cookies.json').open('w') as _cookies:
json.dump(entries['cookies'], _cookies)
lazy_cleanup = self.redis.pipeline()
lazy_cleanup = self.lookyloo.redis.pipeline()
lazy_cleanup.hset('lookup_dirs', uuid, str(dirpath))
if queue and self.redis.zscore('queues', queue):
if queue and self.lookyloo.redis.zscore('queues', queue):
lazy_cleanup.zincrby('queues', -1, queue)
lazy_cleanup.zrem('to_capture', uuid)
lazy_cleanup.srem('ongoing', uuid)

View File

@ -6,9 +6,8 @@ from collections import Counter
from datetime import date, timedelta
from typing import Any, Dict
from redis import Redis
from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path, safe_create_dir
from lookyloo.lookyloo import Lookyloo
from lookyloo.default import AbstractManager, get_config, get_homedir, safe_create_dir
from lookyloo.helpers import ParsedUserAgent, serialize_to_json
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
@ -19,13 +18,15 @@ class Processing(AbstractManager):
def __init__(self, loglevel: int=logging.INFO):
super().__init__(loglevel)
self.script_name = 'archiver'
self.script_name = 'processing'
self.lookyloo = Lookyloo()
self.use_own_ua = get_config('generic', 'use_user_agents_users')
def _to_run_forever(self):
if self.use_own_ua:
self._build_ua_file()
self._retry_failed_enqueue()
def _build_ua_file(self):
'''Build a file in a format compatible with the capture page'''
@ -34,11 +35,10 @@ class Processing(AbstractManager):
safe_create_dir(self_generated_ua_file_path)
self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
if self_generated_ua_file.exists():
self.logger.info(f'User-agent file for {yesterday} already exists.')
self.logger.debug(f'User-agent file for {yesterday} already exists.')
return
self.logger.info(f'Generating user-agent file for {yesterday}')
redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
entries = redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
entries = self.lookyloo.redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
if not entries:
self.logger.info(f'No User-agent file for {yesterday} to generate.')
return
@ -67,13 +67,49 @@ class Processing(AbstractManager):
json.dump(to_store, f, indent=2, default=serialize_to_json)
# Remove the UA / IP mapping.
redis.delete(f'user_agents|{yesterday.isoformat()}')
self.lookyloo.redis.delete(f'user_agents|{yesterday.isoformat()}')
self.logger.info(f'User-agent file for {yesterday} generated.')
def _retry_failed_enqueue(self):
'''If enqueuing failed, the settings are added, with a UUID in the 'to_capture key', and they have a UUID'''
for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
if self.lookyloo.redis.hexists(uuid, 'not_queued'):
self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')
# This capture couldn't be queued and we created the uuid locally
query = self.lookyloo.redis.hgetall(uuid)
try:
self.lookyloo.lacus.enqueue(
url=query.get('url', None),
document_name=query.get('document_name', None),
document=query.get('document', None),
# depth=query.get('depth', 0),
browser=query.get('browser', None),
device_name=query.get('device_name', None),
user_agent=query.get('user_agent', None),
proxy=query.get('proxy', None),
general_timeout_in_sec=query.get('general_timeout_in_sec', None),
cookies=query.get('cookies', None),
headers=query.get('headers', None),
http_credentials=query.get('http_credentials', None),
viewport=query.get('viewport', None),
referer=query.get('referer', None),
rendered_hostname_only=query.get('rendered_hostname_only', True),
# force=query.get('force', False),
# recapture_interval=query.get('recapture_interval', 300),
priority=query.get('priority', None),
uuid=uuid
)
except Exception as e:
self.logger.warning(f'Still unable to enqueue capture: {e}')
break
else:
self.lookyloo.redis.hdel(uuid, 'not_queued')
self.logger.info(f'{uuid} enqueued.')
def main():
p = Processing()
p.run(sleep_in_sec=3600 * 24)
p.run(sleep_in_sec=30)
if __name__ == '__main__':

View File

@ -9,10 +9,12 @@ import smtplib
from collections import defaultdict
from datetime import date, datetime
from email.message import EmailMessage
from functools import cached_property
from io import BytesIO
from pathlib import Path
from typing import (Any, Dict, Iterable, List, MutableMapping, Optional, Set,
Tuple, Union)
from uuid import uuid4
from zipfile import ZipFile
from defang import defang # type: ignore
@ -108,27 +110,36 @@ class Lookyloo():
self._captures_index = CapturesIndex(self.redis, self.context)
self.logger.info('Index initialized.')
# init lacus
self.lacus
@property
def redis(self):
return Redis(connection_pool=self.redis_pool)
@cached_property
def lacus(self):
has_remote_lacus = False
self.lacus: Union[PyLacus, LacusCore]
self._lacus: Union[PyLacus, LacusCore]
if get_config('generic', 'remote_lacus'):
remote_lacus_config = get_config('generic', 'remote_lacus')
if remote_lacus_config.get('enable'):
self.logger.info("Remote lacus enabled, trying to set it up...")
remote_lacus_url = remote_lacus_config.get('url')
self.lacus = PyLacus(remote_lacus_url)
if self.lacus.is_up:
self._lacus = PyLacus(remote_lacus_url)
if self._lacus.is_up:
has_remote_lacus = True
self.logger.info(f"Remote lacus enabled to {remote_lacus_url}.")
else:
self.logger.warning(f"Unable to setup remote lacus to {remote_lacus_url}.")
raise LookylooException('Remote lacus is enabled but unreachable.')
if not has_remote_lacus:
self.lacus = LacusCore(self.redis, get_config('generic', 'tor_proxy'),
get_config('generic', 'only_global_lookups'))
@property
def redis(self):
return Redis(connection_pool=self.redis_pool)
# We need a redis connector that doesn't decode.
redis: Redis = Redis(unix_socket_path=get_socket_path('cache'))
self._lacus = LacusCore(redis, get_config('generic', 'tor_proxy'),
get_config('generic', 'only_global_lookups'))
return self._lacus
def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
@ -375,7 +386,15 @@ class Lookyloo():
elif self.redis.sismember('ongoing', capture_uuid):
# Post-processing on lookyloo's side
return CaptureStatusCore.ONGOING
lacus_status = self.lacus.get_capture_status(capture_uuid)
try:
lacus_status = self.lacus.get_capture_status(capture_uuid)
except Exception as e:
self.logger.warning(f'Unable to get the status for {capture_uuid} from lacus: {e}')
if self.redis.zscore('to_capture', capture_uuid) is not None:
return CaptureStatusCore.QUEUED
else:
return CaptureStatusCore.UNKNOWN
if (lacus_status == CaptureStatusCore.UNKNOWN
and self.redis.zscore('to_capture', capture_uuid) is not None):
# If we do the query before lacus picks it up, we would tell the user that the UUID doesn't exist.
@ -473,41 +492,57 @@ class Lookyloo():
query = self._prepare_lacus_query(query)
priority = get_priority(source, user, authenticated)
perma_uuid = self.lacus.enqueue(
url=query.pop('url', None),
document_name=query.pop('document_name', None),
document=query.pop('document', None),
# depth=query.pop('depth', 0),
browser=query.pop('browser', None),
device_name=query.pop('device_name', None),
user_agent=query.pop('user_agent', None),
proxy=query.pop('proxy', None),
general_timeout_in_sec=query.pop('general_timeout_in_sec', None),
cookies=query.pop('cookies', None),
headers=query.pop('headers', None),
http_credentials=query.pop('http_credentials', None),
viewport=query.pop('viewport', None),
referer=query.pop('referer', None),
rendered_hostname_only=query.pop('rendered_hostname_only', True),
# force=query.pop('force', False),
# recapture_interval=query.pop('recapture_interval', 300),
priority=priority
)
query['priority'] = get_priority(source, user, authenticated)
if query['priority'] < -10:
# Someone is probably abusing the system with useless URLs, remove them from the index
query['listing'] = 0
try:
perma_uuid = self.lacus.enqueue(
url=query.get('url', None),
document_name=query.get('document_name', None),
document=query.get('document', None),
# depth=query.get('depth', 0),
browser=query.get('browser', None),
device_name=query.get('device_name', None),
user_agent=query.get('user_agent', None),
proxy=query.get('proxy', None),
general_timeout_in_sec=query.get('general_timeout_in_sec', None),
cookies=query.get('cookies', None),
headers=query.get('headers', None),
http_credentials=query.get('http_credentials', None),
viewport=query.get('viewport', None),
referer=query.get('referer', None),
rendered_hostname_only=query.get('rendered_hostname_only', True),
# force=query.get('force', False),
# recapture_interval=query.get('recapture_interval', 300),
priority=query.get('priority', 0)
)
except Exception as e:
self.logger.critical(f'Unable to enqueue capture: {e}')
perma_uuid = str(uuid4())
query['not_queued'] = 1
finally:
if (not self.redis.hexists('lookup_dirs', perma_uuid) # already captured
and self.redis.zscore('to_capture', perma_uuid) is None): # capture ongoing
if (not self.redis.hexists('lookup_dirs', perma_uuid) # already captured
and self.redis.zscore('to_capture', perma_uuid) is None): # capture ongoing
if priority < -10:
# Someone is probably abusing the system with useless URLs, remove them from the index
query['listing'] = 0
# Make the settings redis compatible
mapping_capture: Dict[str, Union[bytes, float, int, str]] = {}
for key, value in query.items():
if isinstance(value, bool):
mapping_capture[key] = 1 if value else 0
elif isinstance(value, (list, dict)):
if value:
mapping_capture[key] = json.dumps(value)
elif value is not None:
mapping_capture[key] = value
p = self.redis.pipeline()
p.zadd('to_capture', {perma_uuid: query['priority']})
p.hset(perma_uuid, mapping=mapping_capture)
p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
p.execute()
p = self.redis.pipeline()
p.zadd('to_capture', {perma_uuid: priority})
if query:
p.hset(perma_uuid, mapping=query) # This will add the remaining entries that are lookyloo specific
p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
p.execute()
return perma_uuid
def send_mail(self, capture_uuid: str, /, email: str='', comment: str='') -> None:

20
poetry.lock generated
View File

@ -571,7 +571,7 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-
[[package]]
name = "lacuscore"
version = "1.1.2"
version = "1.1.3"
description = "Core of Lacus, usable as a module"
category = "main"
optional = false
@ -901,7 +901,7 @@ docs = ["Sphinx (>=5.1.1,<6.0.0)"]
[[package]]
name = "pylacus"
version = "1.1.0"
version = "1.1.1"
description = "Python CLI and module for lacus"
category = "main"
optional = false
@ -1045,7 +1045,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "pytz"
version = "2022.5"
version = "2022.6"
description = "World timezone definitions, modern and historical"
category = "main"
optional = false
@ -1476,7 +1476,7 @@ misp = ["python-magic", "pydeep2"]
[metadata]
lock-version = "1.1"
python-versions = ">=3.8,<3.12"
content-hash = "f274187b6e2cc2fd68d671405f24ddbadf6a362e68cea4eb5093fdce7ae9c55e"
content-hash = "5adbb7f2a6e81f21a199fc3f9bd1e4594dcc6a1dece1282f892edc5e95c4ba06"
[metadata.files]
aiohttp = [
@ -1931,8 +1931,8 @@ jsonschema = [
{file = "jsonschema-4.16.0.tar.gz", hash = "sha256:165059f076eff6971bae5b742fc029a7b4ef3f9bcf04c14e4776a7605de14b23"},
]
lacuscore = [
{file = "lacuscore-1.1.2-py3-none-any.whl", hash = "sha256:876d3ccb743bb4d43421d1670762af2f54c6b82dfdec9b5a4c37109dbabd02c6"},
{file = "lacuscore-1.1.2.tar.gz", hash = "sha256:ed83c8f4cb31e24ec0e39ce85fcd9dd675c0ae96bf9aaededf7c21469be6b1ad"},
{file = "lacuscore-1.1.3-py3-none-any.whl", hash = "sha256:8e9cd36083723423b6ecc2bac2da4cd97a35a92e8b3bab8907d583323e732a97"},
{file = "lacuscore-1.1.3.tar.gz", hash = "sha256:e02ea9fa594ce32ee5094917c08d8e20cdc0b2bf5dd939cbacf6b02103c4028e"},
]
lief = [
{file = "lief-0.12.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:cdadaab4b9ec756e1d1f0324acd6e280ae849d251e66f836da455df592deaf9e"},
@ -2358,8 +2358,8 @@ pyhashlookup = [
{file = "pyhashlookup-1.2.1.tar.gz", hash = "sha256:eb514cc1a5559a013a8882e101849fa52a37641f2a7d9dc21c0d266b37607aa5"},
]
pylacus = [
{file = "pylacus-1.1.0-py3-none-any.whl", hash = "sha256:880ce273fe35a554a35d6812e899e69b2f360fa5041822b136fbaf8f336a44d3"},
{file = "pylacus-1.1.0.tar.gz", hash = "sha256:90ca603bf58d19197f4776fffc58b9b98f34d12916457a026ff2cddbaa42fb6f"},
{file = "pylacus-1.1.1-py3-none-any.whl", hash = "sha256:61eab358c20b0fbb1915a91f82d5dc8ee3b8f22de7e23dfe2206af937e4d3728"},
{file = "pylacus-1.1.1.tar.gz", hash = "sha256:4eca75827ba977fb6dbeef587cf2a08324e4e4cdbc30a8762c695099be27a9dc"},
]
pylookyloo = [
{file = "pylookyloo-1.16.0-py3-none-any.whl", hash = "sha256:2c39e26eae61144e6bb986fbcb58604e9804b4d6b2fe1ff844d8b429db2628d2"},
@ -2422,8 +2422,8 @@ python-magic = [
{file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"},
]
pytz = [
{file = "pytz-2022.5-py2.py3-none-any.whl", hash = "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22"},
{file = "pytz-2022.5.tar.gz", hash = "sha256:c4d88f472f54d615e9cd582a5004d1e5f624854a6a27a6211591c251f22a6914"},
{file = "pytz-2022.6-py2.py3-none-any.whl", hash = "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427"},
{file = "pytz-2022.6.tar.gz", hash = "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2"},
]
pytz-deprecation-shim = [
{file = "pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl", hash = "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6"},

View File

@ -67,8 +67,8 @@ passivetotal = "^2.5.9"
werkzeug = "2.1.2"
filetype = "^1.1.0"
pypandora = "^1.2.0"
lacuscore = "^1.1.2"
pylacus = "^1.1.0"
lacuscore = "^1.1.3"
pylacus = "^1.1.1"
[tool.poetry.extras]
misp = ['python-magic', 'pydeep2']

View File

@ -67,6 +67,8 @@ class Monitoring():
to_return = []
for uuid, rank in captures_uuid:
capture_params = self.redis_cache.hgetall(uuid)
if 'document' in capture_params:
capture_params.pop('document')
if capture_params:
to_return.append((uuid, rank, capture_params))