Create generate_majestic-million.py
Pulls the top 10K most referred-to hosts from Majestic Million.
pull/132/head
							parent
							
								
									21643af1bf
								
							
						
					
					
						commit
						109470bf0d
					
				|  | @ -0,0 +1,40 @@ | |||
| #!/usr/bin/env python3 | ||||
| 
 | ||||
| import datetime | ||||
| import logging | ||||
| import json | ||||
| import os | ||||
| import requests | ||||
| 
 | ||||
servers_url = 'http://downloads.majestic.com/majestic_million.csv'
csv_path = 'majestic_million.csv'
hostname_path = 'list.json'


def download_csv(url, path):
    """Download *url* to *path*, failing loudly on HTTP errors.

    The body is streamed into a temporary ``.part`` file that is renamed
    into place only after the transfer completed, so an interrupted or
    failed download never leaves a truncated CSV behind (an existing CSV
    is reused on later runs and would otherwise never be refreshed).

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status.
    """
    tmp_path = path + '.part'
    with requests.get(url, stream=True, timeout=60) as resp:
        # Without this check an HTML error page would be saved as the CSV.
        resp.raise_for_status()
        with open(tmp_path, 'wb') as fd:
            for chunk in resp.iter_content(4096):
                fd.write(chunk)
    os.replace(tmp_path, path)


def extract_hosts(lines, limit=10000):
    """Return the host column of the first *limit* data rows in *lines*.

    *lines* is any iterable of comma-separated text lines (for example an
    open file), consumed lazily so the whole CSV is never held in memory.
    The host is read from the third column. Rows that have fewer than
    three columns, or whose first column is not a number (e.g. a
    column-name header row or a blank line), are skipped and do not count
    towards *limit*.
    """
    hosts = []
    for line in lines:
        if len(hosts) >= limit:
            break
        fields = line.split(',')
        if len(fields) < 3 or not fields[0].strip().isdigit():
            continue
        host = fields[2].strip()
        if host:
            hosts.append(host)
    return hosts


def build_warninglist(hosts):
    """Build the output dictionary for *hosts*.

    The ``list`` entry is de-duplicated and sorted; ``version`` is today's
    date encoded as the integer YYYYMMDD. Keys are inserted in the order
    they are written to the JSON file.
    """
    return {
        'name': 'Top 10K websites from Majestic Million',
        'version': int(datetime.date.today().strftime('%Y%m%d')),
        'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).',
        'matching_attributes': ['hostname', 'domain'],
        'type': 'hostname',
        'list': sorted(set(hosts)),
    }


def main():
    """Fetch the CSV if it is not present locally and write ``list.json``."""
    if os.path.isfile(csv_path):
        logging.warning('Not erasing local csv file')
    else:
        download_csv(servers_url, csv_path)

    with open(csv_path, newline='\n', encoding='utf-8', errors='replace') as csv_file:
        hosts = extract_hosts(csv_file)

    with open(hostname_path, 'w', newline='\n') as hostname_file:
        hostname_file.write(json.dumps(build_warninglist(hosts), indent=2, sort_keys=False))


if __name__ == '__main__':
    main()
		Loading…
	
		Reference in New Issue
	
	 GlennHD
						GlennHD