AIL-framework/bin/core/Crawler_manager.py

69 lines
2.3 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import time
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
import ConfigLoader
import crawlers
# Fetch the "ARDB_Metadata" Redis connection through ConfigLoader.
# The loader instance is only needed for this single lookup, so it is not
# kept; `config_loader` stays bound to None at module level, as before.
r_serv_metadata = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Metadata")
config_loader = None
2020-07-27 15:46:09 +02:00
# # TODO: launch me in core screen
# # TODO: check if already launched in tor screen
# # TODO: handle multiple splash_manager
if __name__ == '__main__':
    # Script entry point.
    #
    # 1. Ping the Splash manager; when it answers, load the splash/proxy
    #    lists and relaunch the crawlers if the crawler test passes.
    # 2. Loop forever: once the check interval has elapsed, re-check the
    #    manager connection and reload/relaunch whenever the manager reports
    #    a session UUID different from the last one handled.

    # Seconds between two manager connection checks.
    MANAGER_CHECK_INTERVAL = 60
    # Seconds to idle when no check is due.
    IDLE_SLEEP = 5

    is_manager_connected = crawlers.ping_splash_manager()
    if not is_manager_connected:
        print('Error, Can\'t connect to Splash manager')
        # No session handled yet: the first non-empty session UUID seen in
        # the loop below will differ from None and trigger a reload.
        session_uuid = None
    else:
        print('Splash manager connected')
        session_uuid = crawlers.get_splash_manager_session_uuid()
        is_manager_connected = crawlers.reload_splash_and_proxies_list()
        print(is_manager_connected)
        if is_manager_connected:
            if crawlers.test_ail_crawlers():
                crawlers.relaunch_crawlers()

    # Monotonic clock: the interval measurement must not be affected by
    # system clock adjustments (NTP corrections, manual changes, ...).
    last_check = time.monotonic()

    while True:
        # # TODO: avoid multiple ping

        # check if manager is connected
        if time.monotonic() - last_check > MANAGER_CHECK_INTERVAL:
            is_manager_connected = crawlers.is_splash_manager_connected()
            current_session_uuid = crawlers.get_splash_manager_session_uuid()
            # reload proxy and splash list when the manager session changed
            if current_session_uuid and current_session_uuid != session_uuid:
                is_manager_connected = crawlers.reload_splash_and_proxies_list()
                if is_manager_connected:
                    print('reload proxies and splash list')
                    if crawlers.test_ail_crawlers():
                        crawlers.relaunch_crawlers()
                # Remember this session even if the reload failed, so the
                # same session UUID does not trigger a reload on every check.
                session_uuid = current_session_uuid
            if not is_manager_connected:
                print('Error, Can\'t connect to Splash manager')
            last_check = time.monotonic()
        else:
            time.sleep(IDLE_SLEEP)

        # # TODO: launch crawlers if was never connected
        # # TODO: periodically refresh the splash and proxy list
        #         (the previous placeholder branch was never reachable)
        # # TODO: kill/launch new crawler / crawler manager check if already launched
        # # TODO: handle multiple splash_manager
        # # TODO: catch reload request