2019-01-29 18:37:13 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2022-04-21 13:53:42 +02:00
|
|
|
import asyncio
|
2019-01-29 18:37:13 +01:00
|
|
|
import logging
|
2022-11-23 15:54:22 +01:00
|
|
|
import logging.config
|
2022-10-28 12:40:28 +02:00
|
|
|
import signal
|
|
|
|
import time
|
2022-08-04 16:58:07 +02:00
|
|
|
|
2021-08-25 13:36:48 +02:00
|
|
|
from pathlib import Path
|
2022-10-28 12:40:28 +02:00
|
|
|
from typing import Dict, Optional, Union
|
2021-08-25 13:36:48 +02:00
|
|
|
|
2022-10-26 14:25:23 +02:00
|
|
|
from lacuscore import LacusCore, CaptureStatus as CaptureStatusCore, CaptureResponse as CaptureResponseCore
|
2022-11-01 18:10:20 +01:00
|
|
|
from pylacus import CaptureStatus as CaptureStatusPy, CaptureResponse as CaptureResponsePy
|
2019-01-29 18:37:13 +01:00
|
|
|
|
2022-11-01 18:10:20 +01:00
|
|
|
from lookyloo.lookyloo import Lookyloo
|
2022-11-23 15:54:22 +01:00
|
|
|
from lookyloo.default import AbstractManager, get_config
|
2022-09-20 14:49:58 +02:00
|
|
|
from lookyloo.helpers import get_captures_dir
|
2019-01-29 18:37:13 +01:00
|
|
|
|
2022-05-02 13:04:55 +02:00
|
|
|
from lookyloo.modules import FOX
|
|
|
|
|
2022-11-23 15:54:22 +01:00
|
|
|
# Configure the logging system at import time from the project's 'logging'
# configuration entry, so loggers used below follow that configuration.
logging.config.dictConfig(get_config('logging'))
|
2019-01-29 18:37:13 +01:00
|
|
|
|
|
|
|
|
2020-11-05 14:14:33 +01:00
|
|
|
class AsyncCapture(AbstractManager):
    '''Manager that picks up finished captures from Lacus and stores them in Lookyloo.

    Each call to `_to_run_forever_async` spawns one `process_capture_queue` task
    and tracks it in `self.captures` (task -> start timestamp) so tasks running
    longer than the configured `max_capture_time` can be cancelled.
    '''

    def __init__(self, loglevel: int=logging.INFO):
        '''Set up the manager.

        :param loglevel: Log level forwarded to the parent manager.
        '''
        super().__init__(loglevel)
        self.script_name = 'async_capture'
        self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
        self.capture_dir: Path = get_captures_dir()
        self.lookyloo = Lookyloo()

        # Running capture tasks, mapped to the time (time.time()) they were started.
        self.captures: Dict[asyncio.Task, float] = {}

        self.fox = FOX(get_config('modules', 'FOX'))
        if not self.fox.available:
            self.logger.warning('Unable to setup the FOX module')

    def thirdparty_submit(self, url: str) -> None:
        '''Submit `url` to the FOX module, if that module is available.'''
        if self.fox.available:
            self.fox.capture_default_trigger(url, auto_trigger=True)

    async def process_capture_queue(self) -> None:
        '''Process a query from the capture queue'''
        self.set_running()
        uuid: Optional[str] = None
        entries: Union[CaptureResponseCore, CaptureResponsePy]
        if isinstance(self.lookyloo.lacus, LacusCore):
            # Local LacusCore: consume_queue is awaited and gives us the UUID to process.
            if uuid := await self.lookyloo.lacus.consume_queue():
                entries = self.lookyloo.lacus.get_capture(uuid, decode=True)
                if entries['status'] != CaptureStatusCore.DONE:
                    self.logger.warning(f'The capture {uuid} is reported as not done ({entries["status"]}) when it should.')
                    # NOTE(review): nesting reconstructed from a flattened source;
                    # confirm these two calls belong to the "not done" branch.
                    self.lookyloo.redis.zrem('to_capture', uuid)
                    self.lookyloo.redis.delete(uuid)
        else:
            # Find a capture that is done
            try:
                for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
                    if not uuid:
                        break
                    entries = self.lookyloo.lacus.get_capture(uuid)
                    if entries['status'] == CaptureStatusPy.DONE:
                        log = f'Got the capture for {uuid} from Lacus'
                        if runtime := entries.get('runtime'):
                            log = f'{log} - Runtime: {runtime}'
                        self.logger.info(log)
                        break
                else:
                    # No captures are ready
                    uuid = None
            except Exception as e:
                self.logger.critical(f'Error when getting captures from lacus, will retry later: {e}')
                uuid = None
                await asyncio.sleep(10)

        # Nothing to process. An empty (falsy) uuid is handled like None: in that
        # case `entries` was never assigned and must not be used below.
        if not uuid:
            self.unset_running()
            return

        self.lookyloo.redis.sadd('ongoing', uuid)
        queue: Optional[str] = self.lookyloo.redis.getdel(f'{uuid}_mgmt')

        to_capture: Dict[str, str] = self.lookyloo.redis.hgetall(uuid)

        if get_config('generic', 'default_public'):
            # By default, the captures are on the index, unless the user mark them as un-listed
            listing = not ('listing' in to_capture and to_capture['listing'].lower() in ['false', '0', ''])
        else:
            # By default, the captures are not on the index, unless the user mark them as listed
            listing = 'listing' in to_capture and to_capture['listing'].lower() in ['true', '1']

        self.lookyloo.store_capture(
            uuid, listing,
            # The browser comes from its own 'browser' field, not from 'os'.
            os=to_capture.get('os'), browser=to_capture.get('browser'),
            parent=to_capture.get('parent'),
            downloaded_filename=entries.get('downloaded_filename'),
            downloaded_file=entries.get('downloaded_file'),
            error=entries.get('error'), har=entries.get('har'),
            png=entries.get('png'), html=entries.get('html'),
            last_redirected_url=entries.get('last_redirected_url'),
            cookies=entries.get('cookies')  # type: ignore
        )

        # Queue the redis cleanup commands on a pipeline and send them together.
        lazy_cleanup = self.lookyloo.redis.pipeline()
        if queue and self.lookyloo.redis.zscore('queues', queue):
            lazy_cleanup.zincrby('queues', -1, queue)
        lazy_cleanup.zrem('to_capture', uuid)
        lazy_cleanup.srem('ongoing', uuid)
        lazy_cleanup.delete(uuid)
        # make sure to expire the key if nothing was processed for a while (= queues empty)
        lazy_cleanup.expire('queues', 600)
        lazy_cleanup.execute()
        self.unset_running()
        self.logger.info(f'Done with {uuid}')

    async def cancel_old_captures(self):
        '''Cancel the tracked tasks running for `max_capture_time` seconds or more,
        then wait for the cancelled tasks to finish.'''
        max_capture_time = get_config('generic', 'max_capture_time')
        now = time.time()
        cancelled_tasks = []
        # No await inside this loop, so self.captures is not modified while iterating.
        for task, timestamp in self.captures.items():
            if now - timestamp >= max_capture_time:
                task.cancel()
                cancelled_tasks.append(task)
                self.logger.warning('A capture has been going for too long, canceling it.')
        if cancelled_tasks:
            # return_exceptions=True: the CancelledError of each task is collected, not raised.
            await asyncio.gather(*cancelled_tasks, return_exceptions=True)

    async def _to_run_forever_async(self):
        '''Start one capture processing task, then wait until the number of
        tracked tasks is below the configured `async_capture_processes`.'''
        await self.cancel_old_captures()
        if self.force_stop:
            return
        capture = asyncio.create_task(self.process_capture_queue())
        self.captures[capture] = time.time()
        # The done callback receives the task, which is the key to drop from the dict.
        capture.add_done_callback(self.captures.pop)
        while len(self.captures) >= get_config('generic', 'async_capture_processes'):
            await self.cancel_old_captures()
            await asyncio.sleep(1)

    async def _wait_to_finish(self):
        '''Block until no capture task is tracked anymore, logging every 5 seconds.'''
        while self.captures:
            self.logger.info(f'Waiting for {len(self.captures)} capture(s) to finish...')
            await asyncio.sleep(5)
        self.logger.info('No more captures')
|
|
|
|
|
2019-01-29 18:37:13 +01:00
|
|
|
|
2020-10-03 21:19:43 +02:00
|
|
|
def main():
    '''Entry point: run the AsyncCapture manager on a dedicated event loop.

    A SIGTERM schedules the manager's `stop_async` coroutine on the loop; the
    loop is closed once `run_async` completes or raises.
    '''
    manager = AsyncCapture()
    event_loop = asyncio.new_event_loop()

    def _schedule_stop() -> None:
        # Signal handlers are plain callables: hand the coroutine to the loop as a task.
        event_loop.create_task(manager.stop_async())

    event_loop.add_signal_handler(signal.SIGTERM, _schedule_stop)

    try:
        event_loop.run_until_complete(manager.run_async(sleep_in_sec=1))
    finally:
        event_loop.close()
|
2020-10-03 21:19:43 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Only start the manager when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|