chg: More cleanup, support clean shutdown of multiple async captures

pull/251/head
Raphaël Vinot 2021-08-25 16:40:51 +02:00
parent bc5e41a1ca
commit 407e78ae7f
5 changed files with 33 additions and 40 deletions

View File

@@ -12,7 +12,6 @@ from datetime import datetime
 from pathlib import Path
 from typing import Union, Dict, Optional, Tuple, List
 from urllib.parse import urlsplit
-from uuid import uuid4

 from defang import refang  # type: ignore
 from redis import Redis
@@ -39,20 +38,13 @@ class AsyncCapture(AbstractManager):
         self.splash_url: str = get_splash_url()
         self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

-    def process_capture_queue(self) -> Union[bool, None]:
+    def process_capture_queue(self) -> None:
         '''Process a query from the capture queue'''
-        if not self.redis.exists('to_capture'):
-            return None
-        status, message = splash_status()
-        if not status:
-            self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
-            return None
         value: Optional[List[Tuple[str, int]]] = self.redis.zpopmax('to_capture')  # type: ignore
         if not value or not value[0]:
-            return None
-        uuid, score = value[0]
+            # The queue was consumed by an other process.
+            return
+        uuid, _score = value[0]
         queue: Optional[str] = self.redis.get(f'{uuid}_mgmt')
         self.redis.sadd('ongoing', uuid)
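A note on the queue change: ZPOPMAX removes and returns the highest-scored member in a single atomic step, so several capture workers can share 'to_capture' without double-processing, and an empty result simply means another worker got there first (which is why the up-front exists() check could go). A minimal standalone sketch of that pattern with redis-py; the socket path and UUID are illustrative, not Lookyloo's:

from typing import Optional

from redis import Redis


def pop_next_capture(redis: Redis) -> Optional[str]:
    '''Atomically pop the highest-priority capture UUID, or None if the queue is empty.'''
    value = redis.zpopmax('to_capture')  # returns [(member, score)] or []
    if not value:
        # Another worker already consumed the queue; nothing to do.
        return None
    uuid, _score = value[0]
    return uuid


# Illustrative usage (assumes a Redis instance reachable on this socket):
r = Redis(unix_socket_path='cache.sock', decode_responses=True)
r.zadd('to_capture', {'some-uuid': 1})
pop_next_capture(r)  # -> 'some-uuid'
pop_next_capture(r)  # -> None, the queue is already drained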
@@ -67,22 +59,20 @@ class AsyncCapture(AbstractManager):
         if 'cookies' in to_capture:
             to_capture['cookies_pseudofile'] = to_capture.pop('cookies')

-        status = self._capture(**to_capture)  # type: ignore
+        if self._capture(**to_capture):  # type: ignore
+            self.logger.info(f'Processed {to_capture["url"]}')
+        else:
+            self.logger.warning(f'Unable to capture {to_capture["url"]}')
         lazy_cleanup.srem('ongoing', uuid)
         lazy_cleanup.delete(uuid)
-        # make sure to expire the key if nothing was process for a while (= queues empty)
+        # make sure to expire the key if nothing was processed for a while (= queues empty)
         lazy_cleanup.expire('queues', 600)
         lazy_cleanup.execute()
-        if status:
-            self.logger.info(f'Processed {to_capture["url"]}')
-            return True
-        self.logger.warning(f'Unable to capture {to_capture["url"]}')
-        return False

-    def _capture(self, url: str, *, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+    def _capture(self, url: str, *, perma_uuid: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                  depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
-                 referer: str='', proxy: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
-                 browser: Optional[str]=None, parent: Optional[str]=None) -> Union[bool, str]:
+                 referer: str='', proxy: str='', os: Optional[str]=None,
+                 browser: Optional[str]=None, parent: Optional[str]=None) -> bool:
         '''Launch a capture'''
         url = url.strip()
         url = refang(url)
@@ -113,8 +103,6 @@ class AsyncCapture(AbstractManager):
         if int(depth) > int(get_config('generic', 'max_depth')):
             self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
             depth = int(get_config('generic', 'max_depth'))
-        if not perma_uuid:
-            perma_uuid = str(uuid4())
         self.logger.info(f'Capturing {url}')
         try:
             items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
@@ -182,12 +170,17 @@ class AsyncCapture(AbstractManager):
                 with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                     json.dump(cookies, _cookies)
         self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
-        return perma_uuid
+        return True

     def _to_run_forever(self):
-        while True:
-            url = self.process_capture_queue()
-            if url is None or shutdown_requested():
+        while self.redis.exists('to_capture'):
+            status, message = splash_status()
+            if not status:
+                self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
+                break
+            self.process_capture_queue()
+            if shutdown_requested():
                 break
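Taken together, the reworked _to_run_forever is what enables the clean shutdown of several capture workers: each pass drains 'to_capture' one capture at a time, bails out if Splash is unreachable, and re-checks the shutdown flag between captures, so a worker never aborts mid-capture. A rough sketch of that loop shape, with splash_status and shutdown_requested passed in as stand-ins rather than the real helpers:

from typing import Callable, Tuple

from redis import Redis


def drain_capture_queue(redis: Redis,
                        process_one: Callable[[], None],
                        splash_status: Callable[[], Tuple[bool, str]],
                        shutdown_requested: Callable[[], bool]) -> None:
    '''Process queued captures until the queue is empty, Splash is down, or shutdown is requested.'''
    while redis.exists('to_capture'):
        up, message = splash_status()
        if not up:
            # Leave the queue untouched; the next pass retries once Splash is back.
            break
        process_one()
        if shutdown_requested():
            # Checked between captures, so the capture in flight always finishes cleanly.
            break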

View File

@@ -3,11 +3,11 @@
 from lookyloo.helpers import is_running, get_socket_path
 import time
-from redis import StrictRedis
+from redis import Redis


 def main():
-    r = StrictRedis(unix_socket_path=get_socket_path('cache'), db=1)
+    r = Redis(unix_socket_path=get_socket_path('cache'), db=1)
     r.set('shutdown', 1)
     time.sleep(5)
     while True:
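The shutdown script only flips a flag: the workers are expected to notice it through shutdown_requested() and stop on their own, while the script waits for is_running() to report that nothing is left. A plausible sketch of that handshake, assuming the helper simply checks the 'shutdown' key in cache db 1 (the real helpers may differ):

import time

from redis import Redis
from lookyloo.helpers import get_socket_path, is_running


def shutdown_requested() -> bool:
    # Assumed behaviour: the flag set by the shutdown script lives in db 1 of the cache instance.
    r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
    return bool(r.exists('shutdown'))


def wait_for_workers() -> None:
    # Poll the 'running' registry until every worker has unset itself.
    while is_running():
        time.sleep(5)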

View File

@@ -4,13 +4,10 @@
 import time
 import signal
 from subprocess import Popen
-from lookyloo.helpers import get_homedir, shutdown_requested, set_running, unset_running, get_socket_path, get_config
-from redis import StrictRedis
+from lookyloo.helpers import get_homedir, shutdown_requested, set_running, unset_running, get_config


 def main():
-    r = StrictRedis(unix_socket_path=get_socket_path('cache'))
-    r.delete('cache_loaded')
     website_dir = get_homedir() / 'website'
     ip = get_config('generic', 'website_listen_ip')
     port = get_config('generic', 'website_listen_port')

View File

@ -53,7 +53,8 @@
"f1c33e72643ce366fd578e3b5d393799e8c9ea27b180987826af43b4fc00b65a4eaae5e6426a23448956fee99e3108c6a86f32fb4896c156e24af0571a11c498", "f1c33e72643ce366fd578e3b5d393799e8c9ea27b180987826af43b4fc00b65a4eaae5e6426a23448956fee99e3108c6a86f32fb4896c156e24af0571a11c498",
"dc7c40381b3d22919e32c1b700ccb77b1b0aea2690642d01c1ac802561e135c01d5a4d2a0ea18efc0ec3362e8c549814a10a23563f1f56bd62aee0ced7e2bd99", "dc7c40381b3d22919e32c1b700ccb77b1b0aea2690642d01c1ac802561e135c01d5a4d2a0ea18efc0ec3362e8c549814a10a23563f1f56bd62aee0ced7e2bd99",
"c2c239cb5cdd0b670780ad6414ef6be9ccd4c21ce46bb93d1fa3120ac812f1679445162978c3df05cb2e1582a1844cc4c41cf74960b8fdae3123999c5d2176cc", "c2c239cb5cdd0b670780ad6414ef6be9ccd4c21ce46bb93d1fa3120ac812f1679445162978c3df05cb2e1582a1844cc4c41cf74960b8fdae3123999c5d2176cc",
"6ad523f5b65487369d305613366b9f68dcdeee225291766e3b25faf45439ca069f614030c08ca54c714fdbf7a944fac489b1515a8bf9e0d3191e1bcbbfe6a9df" "6ad523f5b65487369d305613366b9f68dcdeee225291766e3b25faf45439ca069f614030c08ca54c714fdbf7a944fac489b1515a8bf9e0d3191e1bcbbfe6a9df",
"5065931218ce18ded3a022bd14e8208247f6d0900fff3b41901f9dba45dc417d84e386549e64446f390073431ed23a83d9f4c018da389d2e43f59c26febfc0de"
] ]
}, },
"empty_svg" : { "empty_svg" : {

View File

@@ -170,18 +170,20 @@ def safe_create_dir(to_create: Path) -> None:
 def set_running(name: str) -> None:
-    r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
-    r.hset('running', name, 1)
+    r = Redis(unix_socket_path=get_socket_path('cache'), db=1)
+    r.zincrby('running', 1, name)


 def unset_running(name: str) -> None:
     r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
-    r.hdel('running', name)
+    current_running = r.zincrby('running', -1, name)
+    if int(current_running) <= 0:
+        r.zrem('running', name)


-def is_running() -> Dict[Any, Any]:
+def is_running() -> List[Tuple[str, float]]:
     r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
-    return r.hgetall('running')
+    return r.zrangebyscore('running', '-inf', '+inf', withscores=True)


 def get_socket_path(name: str) -> str:
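The switch from a hash to a sorted set is what lets several instances of the same service (e.g. multiple async capture workers) register under one name: set_running increments a per-name counter, unset_running decrements it and only removes the entry once the last instance stops, and is_running lists the names with their instance counts. A small standalone illustration of that counting behaviour, using the same calls as the diff; the socket path and service name are illustrative:

from redis import Redis

r = Redis(unix_socket_path='cache.sock', db=1, decode_responses=True)  # illustrative socket path

# Two capture workers start up: the score tracks how many instances are alive.
r.zincrby('running', 1, 'async_capture')   # -> 1.0
r.zincrby('running', 1, 'async_capture')   # -> 2.0
r.zrangebyscore('running', '-inf', '+inf', withscores=True)  # [('async_capture', 2.0)]

# One worker stops: the entry stays because an instance is still running.
if int(r.zincrby('running', -1, 'async_capture')) <= 0:
    r.zrem('running', 'async_capture')

# The last worker stops: the counter reaches 0 and the entry is removed.
if int(r.zincrby('running', -1, 'async_capture')) <= 0:
    r.zrem('running', 'async_capture')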