2019-01-29 18:37:13 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2024-01-12 17:15:41 +01:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2022-04-21 13:53:42 +02:00
|
|
|
import asyncio
|
2023-04-28 17:19:49 +02:00
|
|
|
import json
|
2019-01-29 18:37:13 +01:00
|
|
|
import logging
|
2022-11-23 15:54:22 +01:00
|
|
|
import logging.config
|
2022-10-28 12:40:28 +02:00
|
|
|
import signal
|
2022-08-04 16:58:07 +02:00
|
|
|
|
2021-08-25 13:36:48 +02:00
|
|
|
from pathlib import Path
|
|
|
|
|
2022-10-26 14:25:23 +02:00
|
|
|
from lacuscore import LacusCore, CaptureStatus as CaptureStatusCore, CaptureResponse as CaptureResponseCore
|
2024-01-16 00:27:43 +01:00
|
|
|
from pylacus import PyLacus, CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy
|
2019-01-29 18:37:13 +01:00
|
|
|
|
2024-01-13 01:24:32 +01:00
|
|
|
from lookyloo import Lookyloo, CaptureSettings
|
2022-11-23 15:54:22 +01:00
|
|
|
from lookyloo.default import AbstractManager, get_config
|
2022-09-20 14:49:58 +02:00
|
|
|
from lookyloo.helpers import get_captures_dir
|
2019-01-29 18:37:13 +01:00
|
|
|
|
2022-05-02 13:04:55 +02:00
|
|
|
from lookyloo.modules import FOX
|
|
|
|
|
2022-11-23 15:54:22 +01:00
|
|
|
# Configure the logging module at import time from the 'logging' configuration entry.
logging.config.dictConfig(get_config('logging'))
|
2019-01-29 18:37:13 +01:00
|
|
|
|
|
|
|
|
2020-11-05 14:14:33 +01:00
|
|
|
class AsyncCapture(AbstractManager):
    '''Manager that fetches finished captures from Lacus and stores them in Lookyloo.

    When ``self.lookyloo.lacus`` is a LacusCore instance, the manager also
    consumes the capture queue itself and tracks the resulting tasks in
    ``self.captures``.
    '''

    def __init__(self, loglevel: int | None=None) -> None:
        '''Initialize the manager, the Lookyloo instance and the FOX module.

        :param loglevel: Log level passed on to the parent class constructor.
        '''
        super().__init__(loglevel)
        self.script_name = 'async_capture'
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.lookyloo = Lookyloo()

        if isinstance(self.lookyloo.lacus, LacusCore):
            # Tasks of the ongoing captures, only tracked when LacusCore is used
            self.captures: set[asyncio.Task] = set()  # type: ignore[type-arg]

        self.fox = FOX(config_name='FOX')
        if not self.fox.available:
            self.logger.warning('Unable to setup the FOX module')

    def thirdparty_submit(self, url: str) -> None:
        '''Submit the URL to the FOX module, if that module is available.

        :param url: The URL to pass to FOX.
        '''
        if self.fox.available:
            self.fox.capture_default_trigger(url, auto_trigger=True)

    async def _trigger_captures(self) -> None:
        '''Consume the Lacus queue, up to the configured amount of parallel captures.

        Every task returned by ``consume_queue`` is added to ``self.captures``
        and removes itself from that set once it is done.
        '''
        # Only called if LacusCore is used
        max_new_captures = get_config('generic', 'async_capture_processes') - len(self.captures)
        self.logger.debug(f'{len(self.captures)} ongoing captures.')
        if max_new_captures <= 0:
            self.logger.info(f'Max amount of captures in parallel reached ({len(self.captures)})')
            return None
        for capture_task in self.lookyloo.lacus.consume_queue(max_new_captures):  # type: ignore[union-attr]
            self.captures.add(capture_task)
            capture_task.add_done_callback(self.captures.discard)

    def uuids_ready(self) -> list[str]:
        '''Get the list of captures ready to be processed'''
        # Only check if the top 50 in the priority list are done, as they are the most likely ones to be done,
        # and if the list is very very long, iterating over it takes a very long time.
        return [uuid for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf', start=0, num=50)
                if uuid and self.lookyloo.lacus.get_capture_status(uuid) in [CaptureStatusPy.DONE, CaptureStatusCore.DONE]]

    def process_capture_queue(self) -> None:
        '''Store every capture that is ready and clean up its entries in redis.

        For each UUID returned by :meth:`uuids_ready`: fetch the capture from
        Lacus, store it with ``store_capture``, send the report mail when
        ``auto_report`` asks for one, then remove the UUID from the
        ``to_capture`` and ``ongoing`` keys and delete its settings hash.

        :raises Exception: if ``self.lookyloo.lacus`` is neither a LacusCore
                           nor a PyLacus instance.
        '''
        entries: CaptureResponseCore | CaptureResponsePy
        for uuid in self.uuids_ready():
            if isinstance(self.lookyloo.lacus, LacusCore):
                entries = self.lookyloo.lacus.get_capture(uuid, decode=True)
            elif isinstance(self.lookyloo.lacus, PyLacus):
                entries = self.lookyloo.lacus.get_capture(uuid)
            else:
                raise Exception('Something is broken.')
            log = f'Got the capture for {uuid} from Lacus'
            if runtime := entries.get('runtime'):
                log = f'{log} - Runtime: {runtime}'
            self.logger.info(log)

            self.lookyloo.redis.sadd('ongoing', uuid)
            queue: str | None = self.lookyloo.redis.getdel(f'{uuid}_mgmt')

            to_capture: CaptureSettings = self.lookyloo.redis.hgetall(uuid)  # type: ignore[assignment]

            if get_config('generic', 'default_public'):
                # By default, the captures are on the index, unless the user mark them as un-listed
                listing = not ('listing' in to_capture and isinstance(to_capture['listing'], str) and to_capture['listing'].lower() in ['false', '0', ''])
            else:
                # By default, the captures are not on the index, unless the user mark them as listed
                listing = ('listing' in to_capture and isinstance(to_capture['listing'], str) and to_capture['listing'].lower() in ['true', '1'])

            self.lookyloo.store_capture(
                uuid, listing,
                os=to_capture.get('os'), browser=to_capture.get('browser'),
                parent=to_capture.get('parent'),
                downloaded_filename=entries.get('downloaded_filename'),
                downloaded_file=entries.get('downloaded_file'),
                error=entries.get('error'), har=entries.get('har'),
                png=entries.get('png'), html=entries.get('html'),
                last_redirected_url=entries.get('last_redirected_url'),
                cookies=entries.get('cookies'),
                capture_settings=to_capture,
                potential_favicons=entries.get('potential_favicons')
            )

            if 'auto_report' in to_capture:
                send_report = True
                settings = {}
                if isinstance(to_capture['auto_report'], str):
                    if to_capture['auto_report'].isdigit():
                        # auto_report was a bool in the submission, it can be 1 or 0. 0 means no.
                        # NOTE: only skip the mail, never the cleanup below: leaving the loop
                        #       iteration here would keep the UUID in 'to_capture' and 'ongoing'
                        #       and the capture would be processed again on the next run.
                        send_report = to_capture['auto_report'] != '0'
                    else:
                        settings = json.loads(to_capture['auto_report'])
                elif isinstance(to_capture['auto_report'], dict):
                    settings = to_capture['auto_report']

                if send_report:
                    self.lookyloo.send_mail(uuid, email=settings.get('email', ''),
                                            comment=settings.get('comment'))

            lazy_cleanup = self.lookyloo.redis.pipeline()
            if queue and self.lookyloo.redis.zscore('queues', queue):
                lazy_cleanup.zincrby('queues', -1, queue)
            lazy_cleanup.zrem('to_capture', uuid)
            lazy_cleanup.srem('ongoing', uuid)
            lazy_cleanup.delete(uuid)
            # make sure to expire the key if nothing was processed for a while (= queues empty)
            lazy_cleanup.expire('queues', 600)
            lazy_cleanup.execute()
            self.unset_running()
            self.logger.info(f'Done with {uuid}')

    async def _to_run_forever_async(self) -> None:
        '''Run one iteration: trigger new captures (LacusCore only), then process the ready ones.

        Does nothing when ``self.force_stop`` is set.
        '''
        if self.force_stop:
            return None

        if isinstance(self.lookyloo.lacus, LacusCore):
            await self._trigger_captures()
            # NOTE: +1 because running this method also counts for one and will
            #       be decremented when it finishes
            self.set_running(len(self.captures) + 1)

        self.process_capture_queue()

    async def _wait_to_finish_async(self) -> None:
        '''Wait for the ongoing captures to finish, then process the queue one last time.

        The waiting and the final processing only happen when LacusCore is used.
        '''
        if isinstance(self.lookyloo.lacus, LacusCore):
            while self.captures:
                self.logger.info(f'Waiting for {len(self.captures)} capture(s) to finish...')
                await asyncio.sleep(5)
                # NOTE: +1 so we don't quit before the final process capture queue
                self.set_running(len(self.captures) + 1)
            self.process_capture_queue()
            self.unset_running()
        self.logger.info('No more captures')
|
|
|
|
|
2019-01-29 18:37:13 +01:00
|
|
|
|
2024-01-12 17:15:41 +01:00
|
|
|
def main() -> None:
    '''Create the AsyncCapture manager and run it in a dedicated event loop.

    A SIGTERM handler schedules the manager's ``stop_async`` coroutine on the
    loop; the loop is always closed when ``run_async`` returns or raises.
    '''
    manager = AsyncCapture()
    event_loop = asyncio.new_event_loop()

    def _schedule_stop() -> None:
        # Called by the loop on SIGTERM: queue the stop coroutine as a task.
        event_loop.create_task(manager.stop_async())

    event_loop.add_signal_handler(signal.SIGTERM, _schedule_stop)

    try:
        event_loop.run_until_complete(manager.run_async(sleep_in_sec=1))
    finally:
        event_loop.close()


# Only start the manager when the file is executed as a script.
if __name__ == '__main__':
    main()
|