Create generate_majestic-million.py
Pulls the top 10K most-referred-to hosts from Majestic Million.
pull/132/head
parent
21643af1bf
commit
109470bf0d
|
@ -0,0 +1,40 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
|
||||
# Source of the Majestic Million ranking and the local file names used by
# this script (the CSV is cached locally; the JSON is the generated output).
servers_url = 'http://downloads.majestic.com/majestic_million.csv'
csv_path = 'majestic_million.csv'
hostname_path = 'list.json'


def download_csv(url=servers_url, dest=csv_path, timeout=60):
    """Download the ranking CSV to *dest* unless a local copy already exists.

    The body is streamed to a temporary ``<dest>.part`` file and only renamed
    to *dest* once the transfer completed, so an interrupted or failed
    download never leaves a truncated file that later runs would reuse.

    Raises whatever ``requests`` raises on connection problems, and
    ``requests.HTTPError`` (via ``raise_for_status``) on a non-2xx response.
    """
    if os.path.isfile(dest):
        logging.warning('Not erasing local csv file')
        return

    partial_path = dest + '.part'
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Without this check an HTTP error page would be stored as the CSV.
        response.raise_for_status()
        with open(partial_path, 'wb') as fd:
            for chunk in response.iter_content(4096):
                fd.write(chunk)
    os.replace(partial_path, dest)


def read_top_hosts(lines, limit=10000):
    """Return the host column of the first *limit* data rows of *lines*.

    *lines* is any iterable of CSV text lines (for example an open file); it
    is consumed lazily and reading stops once *limit* hosts were collected.
    A row counts as data only when it has at least three columns and its
    first column is numeric, so a header row, blank lines and malformed rows
    are skipped instead of being counted or raising ``IndexError``.
    """
    hosts = []
    for line in lines:
        if len(hosts) >= limit:
            break
        fields = line.split(',')
        if len(fields) < 3 or not fields[0].strip().isdigit():
            continue
        hosts.append(fields[2].rstrip())
    return hosts


def build_warninglist(hosts, version=None):
    """Build the warninglist dictionary for *hosts*.

    *version* defaults to today's date as a ``YYYYMMDD`` integer. The hosts
    are de-duplicated and sorted; keys are inserted in the order in which
    they are serialised to JSON.
    """
    if version is None:
        version = int(datetime.date.today().strftime('%Y%m%d'))
    return {
        'name': 'Top 10K websites from Majestic Million',
        'version': version,
        'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).',
        'matching_attributes': ['hostname', 'domain'],
        'type': 'hostname',
        'list': sorted(set(hosts)),
    }


def main():
    """Fetch the CSV if needed and write the top-10K warninglist JSON."""
    download_csv()

    with open(csv_path, newline='\n', encoding='utf-8', errors='replace') as csv_file:
        hosts = read_top_hosts(csv_file)

    with open(hostname_path, 'w', newline='\n') as hostname_file:
        hostname_file.write(json.dumps(build_warninglist(hosts), indent=2, sort_keys=False))


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue