mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			chg: [Crawler] major refractor
							parent
							
								
									60f7645ac1
								
							
						
					
					
						commit
						7b32d7f34e
					
				
							
								
								
									
										183
									
								
								bin/Crawler.py
								
								
								
								
							
							
						
						
									
										183
									
								
								bin/Crawler.py
								
								
								
								
							|  | @ -4,6 +4,8 @@ | |||
| import os | ||||
| import sys | ||||
| import re | ||||
| import uuid | ||||
| import json | ||||
| import redis | ||||
| import datetime | ||||
| import time | ||||
|  | @ -30,19 +32,39 @@ def load_blacklist(service_type): | |||
| 
 | ||||
| # Extract info form url (url, domain, domain url, ...) | ||||
| def unpack_url(url): | ||||
|     to_crawl = {} | ||||
|     faup.decode(url) | ||||
|     url_unpack = faup.get() | ||||
|     domain = url_unpack['domain'].decode() | ||||
|     to_crawl['domain'] = url_unpack['domain'].decode() | ||||
| 
 | ||||
|     if url_unpack['scheme'] is None: | ||||
|         to_crawl['scheme'] = url_unpack['scheme'] | ||||
|         to_crawl['url']= 'http://{}'.format(url) | ||||
|         to_crawl['domain_url'] = 'http://{}'.format(domain) | ||||
|         to_crawl['scheme'] = 'http' | ||||
|     else: | ||||
|         to_crawl['scheme'] = url_unpack['scheme'] | ||||
|         to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url) | ||||
|         to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], domain) | ||||
|     to_crawl['port'] = url_unpack['port'] | ||||
|     to_crawl['tld'] = url_unpack['tld'].deocode() | ||||
|         scheme = url_unpack['scheme'].decode() | ||||
|         if scheme in default_proto_map: | ||||
|             to_crawl['scheme'] = scheme | ||||
|         else: | ||||
|             to_crawl['scheme'] = 'http' | ||||
| 
 | ||||
|     if url_unpack['port'] is None: | ||||
|         to_crawl['port'] = default_proto_map[to_crawl['scheme']] | ||||
|     else: | ||||
|         port = url_unpack['port'].decode() | ||||
|         # Verify port number                        #################### make function to verify/correct port number | ||||
|         try: | ||||
|             int(port) | ||||
|         # Invalid port Number | ||||
|         except Exception as e: | ||||
|             port = default_proto_map[to_crawl['scheme']] | ||||
|         to_crawl['port'] = port | ||||
| 
 | ||||
|     if url_unpack['query_string'] is None: | ||||
|         to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port']) | ||||
|     else: | ||||
|         to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode()) | ||||
|     to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], to_crawl['domain'], to_crawl['port']) | ||||
| 
 | ||||
|     to_crawl['tld'] = url_unpack['tld'].decode() | ||||
|     return to_crawl | ||||
| 
 | ||||
| # get url, paste and service_type to crawl | ||||
|  | @ -70,20 +92,45 @@ def get_elem_to_crawl(rotation_mode): | |||
|             url, paste = splitted | ||||
|             if paste: | ||||
|                 paste = paste.replace(PASTES_FOLDER+'/', '') | ||||
|         else: | ||||
|             url = message | ||||
|             paste = 'requested' | ||||
| 
 | ||||
|         message = {'url': url, 'paste': paste, 'type_service': domain_service_type, 'original_message': message} | ||||
| 
 | ||||
|     return message | ||||
| 
 | ||||
| def load_crawler_config(service_type, domain, paste): | ||||
| def get_crawler_config(redis_server, mode, service_type, domain): | ||||
|     crawler_options = {} | ||||
|     config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain)) | ||||
|     if config is None: | ||||
|         config = {} | ||||
|     else: | ||||
|         config = json.loads(config) | ||||
|     for option in default_crawler_config: | ||||
|         if option in config: | ||||
|             crawler_options[option] = config[option] | ||||
|         else: | ||||
|             crawler_options[option] = default_crawler_config[option] | ||||
|     return crawler_options | ||||
| 
 | ||||
| def load_crawler_config(service_type, domain, paste, date): | ||||
|     crawler_config = {} | ||||
|     crawler_config['splash_url'] = splash_url | ||||
|     crawler_config['item'] = paste | ||||
|     crawler_config['service_type'] = service_type | ||||
|     crawler_config['domain'] = domain | ||||
|     crawler_config['date'] = date | ||||
| 
 | ||||
|     # Auto and Manual Crawling | ||||
|     if paste is None: | ||||
|     # Auto ################################################# create new entry, next crawling => here or when ended ? | ||||
|     if paste == 'auto': | ||||
|         crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain) | ||||
|         crawler_config['requested'] = True | ||||
|     # Manual | ||||
|     elif paste == 'manual': | ||||
|         crawler_config['crawler_options'] = get_crawler_config(r_cache, 'manual', service_type, domain) | ||||
|         crawler_config['requested'] = True | ||||
|     # default crawler | ||||
|     else: | ||||
|         crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'default', service_type, domain) | ||||
|         crawler_config['requested'] = False | ||||
|     return crawler_config | ||||
| 
 | ||||
|  | @ -108,17 +155,13 @@ def on_error_send_message_back_in_queue(type_service, domain, message): | |||
|         redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain) | ||||
|         redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message) | ||||
| 
 | ||||
| ##########################################################################################################< | ||||
| def crawl_onion(url, domain, message, crawler_config): | ||||
|     crawler_config['url'] = url | ||||
|     print('Launching Crawler: {}'.format(url)) | ||||
| 
 | ||||
|     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain) | ||||
|     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S")) | ||||
| 
 | ||||
|     super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') | ||||
|     if super_father is None: | ||||
|         super_father=paste | ||||
| 
 | ||||
|     retry = True | ||||
|     nb_retry = 0 | ||||
|     while retry: | ||||
|  | @ -144,7 +187,11 @@ def crawl_onion(url, domain, message, crawler_config): | |||
| 
 | ||||
|     if r.status_code == 200: | ||||
|         r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling') | ||||
|         process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_service, url, domain, paste, super_father], | ||||
|         # save config in cash | ||||
|         UUID = str(uuid.uuid4()) | ||||
|         r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config)) | ||||
| 
 | ||||
|         process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', UUID], | ||||
|                                    stdout=subprocess.PIPE) | ||||
|         while process.poll() is None: | ||||
|             time.sleep(1) | ||||
|  | @ -193,16 +240,15 @@ def search_potential_source_domain(type_service, domain): | |||
| if __name__ == '__main__': | ||||
| 
 | ||||
|     if len(sys.argv) != 3: | ||||
|         print('usage:', 'Crawler.py', 'type_service (onion or i2p or regular)', 'splash_port') | ||||
|         print('usage:', 'Crawler.py', 'mode', 'splash_port') | ||||
|         exit(1) | ||||
| ################################################## | ||||
|     type_service = sys.argv[1] | ||||
|     #mode = sys.argv[1] | ||||
|     splash_port = sys.argv[2] | ||||
| 
 | ||||
|     rotation_mode = ['onion', 'regular'] | ||||
| 
 | ||||
|     default_port = ['http': 80, 'https': 443] | ||||
| 
 | ||||
|     default_proto_map = {'http': 80, 'https': 443} | ||||
| ######################################################## add ftp ??? | ||||
| ################################################################### # TODO: port | ||||
| 
 | ||||
|     publisher.port = 6380 | ||||
|  | @ -216,8 +262,6 @@ if __name__ == '__main__': | |||
|     splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"),  splash_port) | ||||
|     print('splash url: {}'.format(splash_url)) | ||||
| 
 | ||||
|     faup = Faup() | ||||
| 
 | ||||
|     PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) | ||||
| 
 | ||||
|     r_serv_metadata = redis.StrictRedis( | ||||
|  | @ -238,6 +282,16 @@ if __name__ == '__main__': | |||
|         db=p.config.getint("ARDB_Onion", "db"), | ||||
|         decode_responses=True) | ||||
| 
 | ||||
|     faup = Faup() | ||||
| 
 | ||||
|     # Default crawler options | ||||
|     default_crawler_config = {'html': 1, | ||||
|                               'har': 1, | ||||
|                               'png': 1, | ||||
|                               'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"), | ||||
|                               'closespider_pagecount': 50, | ||||
|                               'user_agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'} | ||||
| 
 | ||||
|     # Track launched crawler | ||||
|     r_cache.sadd('all_crawler', splash_port) | ||||
|     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting') | ||||
|  | @ -250,8 +304,12 @@ if __name__ == '__main__': | |||
|     while True: | ||||
| 
 | ||||
|         to_crawl = get_elem_to_crawl(rotation_mode) | ||||
|         if to_crawl_dict: | ||||
|         if to_crawl: | ||||
|             print(to_crawl) | ||||
|             print(to_crawl['url']) | ||||
|             url_data = unpack_url(to_crawl['url']) | ||||
|             print('url') | ||||
|             print(url_data) | ||||
|             # remove domain from queue | ||||
|             redis_crawler.srem('{}_domain_crawler_queue'.format(to_crawl['type_service']), url_data['domain']) | ||||
| 
 | ||||
|  | @ -265,20 +323,21 @@ if __name__ == '__main__': | |||
|             print('domain_url:  {}'.format(url_data['domain_url'])) | ||||
| 
 | ||||
|             # Check blacklist | ||||
|             if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain'])): | ||||
|                 date = {'date_day'= datetime.datetime.now().strftime("%Y%m%d"), | ||||
|                         'date_month'= datetime.datetime.now().strftime("%Y%m"), | ||||
|                         'epoch'= int(time.time())} | ||||
|             if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']): | ||||
|                 date = {'date_day': datetime.datetime.now().strftime("%Y%m%d"), | ||||
|                         'date_month': datetime.datetime.now().strftime("%Y%m"), | ||||
|                         'epoch': int(time.time())} | ||||
| 
 | ||||
| 
 | ||||
|                 crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste']) | ||||
|                 crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date) | ||||
|                 print(crawler_config) | ||||
|                 # check if default crawler | ||||
|                 if not crawler_config['requested']: | ||||
|                     # Auto crawl only if service not up this month | ||||
|                     if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']): | ||||
|                         continue | ||||
|                 #if not crawler_config['requested']: | ||||
|                 #    # Auto crawl only if service not up this month | ||||
|                 #    if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']): | ||||
|                 #        continue | ||||
| 
 | ||||
|                 set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste'], crawler_config) | ||||
|                 set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste']) | ||||
| 
 | ||||
| 
 | ||||
|                 #### CRAWLER #### | ||||
|  | @ -287,14 +346,14 @@ if __name__ == '__main__': | |||
| 
 | ||||
|                 ######################################################crawler strategy | ||||
|                     # CRAWL domain | ||||
|                     crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message']) | ||||
|                     crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config) | ||||
| 
 | ||||
|                 # Default Crawler | ||||
|                 else: | ||||
|                     # CRAWL domain | ||||
|                     crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message']) | ||||
|                     if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']): | ||||
|                         crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message']) | ||||
|                     crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'], crawler_config) | ||||
|                     #if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']): | ||||
|                     #    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message']) | ||||
| 
 | ||||
| 
 | ||||
|                         ################################################### handle port | ||||
|  | @ -304,34 +363,34 @@ if __name__ == '__main__': | |||
|                         ####         #### | ||||
| 
 | ||||
| 
 | ||||
|                     # Save last_status day (DOWN) | ||||
|                     if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']): | ||||
|                         redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['day']), url_data['domain']) | ||||
|                 # Save last_status day (DOWN) | ||||
|                 if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']): | ||||
|                     redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain']) | ||||
| 
 | ||||
|                     # if domain was UP at least one time | ||||
|                     if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])): | ||||
|                         # add crawler history (if domain is down) | ||||
|                         if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']): | ||||
|                             # Domain is down | ||||
|                             redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch'])) | ||||
|                 # if domain was UP at least one time | ||||
|                 if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])): | ||||
|                     # add crawler history (if domain is down) | ||||
|                     if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']): | ||||
|                         # Domain is down | ||||
|                         redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch'])) | ||||
| 
 | ||||
|                         ############################ | ||||
|                         # extract page content | ||||
|                         ############################ | ||||
| 
 | ||||
|                     # update list, last crawled domains | ||||
|                     redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), url_data['domain']) | ||||
|                     redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15) | ||||
|                 # update list, last crawled domains | ||||
|                 redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), url_data['domain']) | ||||
|                 redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15) | ||||
| 
 | ||||
|                     #update crawler status | ||||
|                     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting') | ||||
|                     r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain') | ||||
|                 else: | ||||
|                     print('                 Blacklisted Domain') | ||||
|                     print() | ||||
|                     print() | ||||
|                 #update crawler status | ||||
|                 r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting') | ||||
|                 r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain') | ||||
| 
 | ||||
|                 time.sleep(60) | ||||
|             else: | ||||
|                 continue | ||||
|                 print('                 Blacklisted Domain') | ||||
|                 print() | ||||
|                 print() | ||||
| 
 | ||||
|         else: | ||||
|             time.sleep(1) | ||||
|  |  | |||
|  | @ -76,10 +76,12 @@ class HiddenServices(object): | |||
|             pass | ||||
| 
 | ||||
|     def get_origin_paste_name(self): | ||||
|         origin_paste = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent') | ||||
|         if origin_paste is None: | ||||
|         origin_item = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent') | ||||
|         if origin_item is None: | ||||
|             return '' | ||||
|         return origin_paste.replace(self.paste_directory+'/', '') | ||||
|         elif origin_item == 'auto' or origin_item == 'manual': | ||||
|             return origin_item | ||||
|         return origin_item.replace(self.paste_directory+'/', '') | ||||
| 
 | ||||
|     def get_domain_tags(self, update=False): | ||||
|         if not update: | ||||
|  | @ -88,18 +90,36 @@ class HiddenServices(object): | |||
|             self.get_last_crawled_pastes() | ||||
|             return self.tags | ||||
| 
 | ||||
|     def update_domain_tags(self, children): | ||||
|         p_tags = self.r_serv_metadata.smembers('tag:'+children) | ||||
|     def update_domain_tags(self, item): | ||||
|         if self.r_serv_metadata.exists('tag:{}'.format(item)): | ||||
|             p_tags = self.r_serv_metadata.smembers('tag:{}'.format(item)) | ||||
|         # update path here | ||||
|         else: | ||||
|             # need to remove it | ||||
|             if self.paste_directory in item: | ||||
|                 p_tags = self.r_serv_metadata.smembers('tag:{}'.format(item.replace(self.paste_directory+'/', ''))) | ||||
|             # need to remove it | ||||
|             else: | ||||
|                 p_tags = self.r_serv_metadata.smembers('tag:{}'.format(os.path.join(self.paste_directory, item))) | ||||
|         print(p_tags) | ||||
|         for tag in p_tags: | ||||
|             self.tags[tag] = self.tags.get(tag, 0) + 1 | ||||
| 
 | ||||
|     #todo use the right paste | ||||
|     def get_last_crawled_pastes(self): | ||||
|         paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent') | ||||
|         #paste_parent = paste_parent.replace(self.paste_directory, '')[1:] | ||||
|         return self.get_all_pastes_domain(paste_parent) | ||||
|         paste_root = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0)[0] | ||||
|         return self.get_all_pastes_domain(paste_root) | ||||
| 
 | ||||
|     def get_all_pastes_domain(self, father): | ||||
|     def get_all_pastes_domain(self, root_item): | ||||
|         if root_item is None: | ||||
|             return [] | ||||
|         l_crawled_pastes = [] | ||||
|         l_crawled_pastes = self.get_item_crawled_children(root_item) | ||||
|         l_crawled_pastes.append(root_item) | ||||
|         self.update_domain_tags(root_item) | ||||
|         return l_crawled_pastes | ||||
| 
 | ||||
|     def get_item_crawled_children(self, father): | ||||
|         if father is None: | ||||
|             return [] | ||||
|         l_crawled_pastes = [] | ||||
|  | @ -112,9 +132,10 @@ class HiddenServices(object): | |||
|             if self.domain in children: | ||||
|                 l_crawled_pastes.append(children) | ||||
|                 self.update_domain_tags(children) | ||||
|                 l_crawled_pastes.extend(self.get_all_pastes_domain(children)) | ||||
|                 l_crawled_pastes.extend(self.get_item_crawled_children(children)) | ||||
|         return l_crawled_pastes | ||||
| 
 | ||||
|     # experimental | ||||
|     def get_domain_son(self, l_paste): | ||||
|         if l_paste is None: | ||||
|             return None | ||||
|  | @ -133,6 +154,7 @@ class HiddenServices(object): | |||
| 
 | ||||
|         return set_domain | ||||
| 
 | ||||
|     ''' | ||||
|     def get_all_domain_son(self, father): | ||||
|         if father is None: | ||||
|             return [] | ||||
|  | @ -149,6 +171,7 @@ class HiddenServices(object): | |||
|                 l_crawled_pastes.extend(self.get_all_domain_son(children)) | ||||
| 
 | ||||
|         return l_crawled_pastes | ||||
|     ''' | ||||
| 
 | ||||
|     def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1): | ||||
|         l_screenshot_paste = [] | ||||
|  | @ -176,6 +199,7 @@ class HiddenServices(object): | |||
|         l_crawled_pastes = [] | ||||
|         return l_crawled_pastes | ||||
| 
 | ||||
|     ''' | ||||
|     def get_last_crawled_pastes_fileSearch(self): | ||||
| 
 | ||||
|         last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check') | ||||
|  | @ -185,3 +209,4 @@ class HiddenServices(object): | |||
|         pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8]) | ||||
|         l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f] | ||||
|         return l_crawled_pastes | ||||
|     ''' | ||||
|  |  | |||
|  | @ -46,23 +46,23 @@ class TorSplashCrawler(): | |||
|             'DEPTH_LIMIT': crawler_options['depth_limit'] | ||||
|             }) | ||||
| 
 | ||||
|     def crawl(self, type, crawler_options, url, domain, original_paste, super_father): | ||||
|         self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, url=url, domain=domain,original_paste=original_paste, super_father=super_father) | ||||
|     def crawl(self, type, crawler_options, date, url, domain, original_item): | ||||
|         self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain,original_item=original_item) | ||||
|         self.process.start() | ||||
| 
 | ||||
|     class TorSplashSpider(Spider): | ||||
|         name = 'TorSplashSpider' | ||||
| 
 | ||||
|         def __init__(self, type, crawler_options, url, domain,original_paste, super_father, *args, **kwargs): | ||||
|         def __init__(self, type, crawler_options, date, url, domain, original_item, *args, **kwargs): | ||||
|             self.type = type | ||||
|             self.original_paste = original_paste | ||||
|             self.super_father = super_father | ||||
|             self.original_item = original_item | ||||
|             self.root_key = None | ||||
|             self.start_urls = url | ||||
|             self.domains = [domain] | ||||
|             date = datetime.datetime.now().strftime("%Y/%m/%d") | ||||
|             self.full_date = datetime.datetime.now().strftime("%Y%m%d") | ||||
|             self.date_month = datetime.datetime.now().strftime("%Y%m") | ||||
|             date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8]) | ||||
|             self.full_date = date['date_day'] | ||||
|             self.date_month = date['date_month'] | ||||
|             self.date_epoch = int(date['epoch']) | ||||
| 
 | ||||
|             self.arg_crawler = {  'html': crawler_options['html'], | ||||
|                                   'wait': 10, | ||||
|  | @ -97,12 +97,12 @@ class TorSplashCrawler(): | |||
|                 db=self.p.config.getint("ARDB_Onion", "db"), | ||||
|                 decode_responses=True) | ||||
| 
 | ||||
|             self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date ) | ||||
|             self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str ) | ||||
| 
 | ||||
|             self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), | ||||
|                                             self.p.config.get("Directories", "crawled"), date ) | ||||
|                                             self.p.config.get("Directories", "crawled"), date_str ) | ||||
| 
 | ||||
|             self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date ) | ||||
|             self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str ) | ||||
| 
 | ||||
|         def start_requests(self): | ||||
|             yield SplashRequest( | ||||
|  | @ -110,7 +110,7 @@ class TorSplashCrawler(): | |||
|                 self.parse, | ||||
|                 errback=self.errback_catcher, | ||||
|                 endpoint='render.json', | ||||
|                 meta={'father': self.original_paste, 'root_key': None}, | ||||
|                 meta={'father': self.original_item, 'root_key': None}, | ||||
|                 args=self.arg_crawler | ||||
|             ) | ||||
| 
 | ||||
|  | @ -153,7 +153,7 @@ class TorSplashCrawler(): | |||
|                     if self.root_key is None: | ||||
|                         self.root_key = relative_filename_paste | ||||
|                         # Create/Update crawler history | ||||
|                         self.r_serv_onion.zadd('crawler_history_{}:{}'.format(type_service, domain), int(date['epoch']), self.root_key) | ||||
|                         self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, self.domains[0]), self.date_epoch, self.root_key) | ||||
| 
 | ||||
|                     #create paste metadata | ||||
|                     self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.root_key) | ||||
|  |  | |||
|  | @ -3,16 +3,15 @@ | |||
| 
 | ||||
| import os | ||||
| import sys | ||||
| import json | ||||
| import redis | ||||
| import configparser | ||||
| from TorSplashCrawler import TorSplashCrawler | ||||
| 
 | ||||
| tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0' | ||||
| default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50} | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
| 
 | ||||
|     if len(sys.argv) != 7: | ||||
|         print('usage:', 'tor_crawler.py', 'splash_url', 'type', 'url', 'domain', 'paste', 'super_father') | ||||
|     if len(sys.argv) != 2: | ||||
|         print('usage:', 'tor_crawler.py', 'uuid') | ||||
|         exit(1) | ||||
| 
 | ||||
|     configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') | ||||
|  | @ -24,36 +23,27 @@ if __name__ == '__main__': | |||
|     cfg = configparser.ConfigParser() | ||||
|     cfg.read(configfile) | ||||
| 
 | ||||
|     splash_url = sys.argv[1] | ||||
|     type = sys.argv[2] | ||||
|     redis_cache = redis.StrictRedis( | ||||
|         host=cfg.get("Redis_Cache", "host"), | ||||
|         port=cfg.getint("Redis_Cache", "port"), | ||||
|         db=cfg.getint("Redis_Cache", "db"), | ||||
|         decode_responses=True) | ||||
| 
 | ||||
|     url = sys.argv[3] | ||||
|     domain = sys.argv[4] | ||||
|     paste = sys.argv[5] | ||||
|     super_father = sys.argv[6] | ||||
|     # get crawler config key | ||||
|     uuid = sys.argv[1] | ||||
| 
 | ||||
|     if crawler_options is None: | ||||
|         crawler_options = default_crawler_options | ||||
|     # get configs | ||||
|     crawler_json = json.loads(redis_cache.get('crawler_request:{}'.format(uuid))) | ||||
| 
 | ||||
|     splash_url = crawler_json['splash_url'] | ||||
|     service_type = crawler_json['service_type'] | ||||
|     url = crawler_json['url'] | ||||
|     domain = crawler_json['domain'] | ||||
|     original_item = crawler_json['item'] | ||||
|     crawler_options = crawler_json['crawler_options'] | ||||
|     date = crawler_json['date'] | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     redis_crawler.exists('crawler_option_manual:{}:{}'.format(service_type, domain)): | ||||
| crawler_config['mode_name'] = 'auto' | ||||
|         crawler_config['requested'] = True | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit") | ||||
|     crawler_options['user_agent'] = tor_browser_agent | ||||
|     redis_cache.delete('crawler_request:{}'.format(uuid)) | ||||
| 
 | ||||
|     crawler = TorSplashCrawler(splash_url, crawler_options) | ||||
|     crawler.crawl(type, crawler_options, url, domain, paste, super_father) | ||||
|     crawler.crawl(service_type, crawler_options, date, url, domain, original_item) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Terrtia
						Terrtia