AIL-framework/bin/torcrawler/tor_crawler.py

50 lines
1.5 KiB
Python
Raw Normal View History

2018-08-09 17:42:21 +02:00
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
2019-02-25 16:38:50 +01:00
import json
import redis
2018-08-09 17:42:21 +02:00
import configparser
from TorSplashCrawler import TorSplashCrawler
def load_crawler_request(cache, request_uuid):
    """Fetch and decode the crawler request stored under ``crawler_request:<uuid>``.

    Args:
        cache: Object exposing ``get(key)`` (here a ``redis.StrictRedis``
            client created with ``decode_responses=True``).
        request_uuid: Identifier given on the command line.

    Returns:
        The JSON-decoded request.

    Raises:
        LookupError: If no value is stored under the request key.
        json.JSONDecodeError: If the stored value is not valid JSON.
    """
    request_key = 'crawler_request:{}'.format(request_uuid)
    raw_request = cache.get(request_key)
    # A missing key yields None; json.loads(None) would fail with an
    # unhelpful TypeError, so report the real cause instead.
    if raw_request is None:
        raise LookupError('No crawler request found for key: {}'.format(request_key))
    return json.loads(raw_request)


if __name__ == '__main__':

    if len(sys.argv) != 2:
        print('usage:', 'tor_crawler.py', 'uuid')
        # sys.exit is always available; the bare exit() helper comes from
        # the site module and may be absent.
        sys.exit(1)

    config_error = ('Unable to find the configuration file. '
                    'Did you set environment variables? '
                    'Or activate the virtualenv.')

    # An unset AIL_BIN gets the same explanatory error as a missing file,
    # rather than a bare KeyError.
    ail_bin = os.environ.get('AIL_BIN')
    if ail_bin is None:
        raise Exception(config_error)
    configfile = os.path.join(ail_bin, 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception(config_error)

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    redis_cache = redis.StrictRedis(
        host=cfg.get("Redis_Cache", "host"),
        port=cfg.getint("Redis_Cache", "port"),
        db=cfg.getint("Redis_Cache", "db"),
        decode_responses=True)

    # get crawler config key
    crawler_uuid = sys.argv[1]

    # get configs
    crawler_json = load_crawler_request(redis_cache, crawler_uuid)

    splash_url = crawler_json['splash_url']
    service_type = crawler_json['service_type']
    url = crawler_json['url']
    domain = crawler_json['domain']
    original_item = crawler_json['item']
    crawler_options = crawler_json['crawler_options']
    date = crawler_json['date']

    # The request is removed from the cache once all its fields have been
    # read, before the crawl starts.
    redis_cache.delete('crawler_request:{}'.format(crawler_uuid))

    crawler = TorSplashCrawler(splash_url, crawler_options)
    crawler.crawl(service_type, crawler_options, date, url, domain, original_item)