Create generate_majestic-million.py

Pulls the top 10K most-referred-to hosts from the Majestic Million list.
pull/132/head
GlennHD 2020-02-03 01:01:22 -06:00 committed by GitHub
parent 21643af1bf
commit 109470bf0d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 40 additions and 0 deletions

View File

@ -0,0 +1,40 @@
#!/usr/bin/env python3
import csv
import datetime
import json
import logging
import os

import requests
servers_url = 'http://downloads.majestic.com/majestic_million.csv'
csv_path = 'majestic_million.csv'
hostname_path = 'list.json'


def download_csv(url, path):
    """Download *url* to *path*, leaving no file at *path* on failure.

    The body is streamed to ``path + '.part'`` and only moved into place
    once the transfer finished, because the caller treats an existing
    *path* as a valid cached copy and never downloads it again.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    partial_path = path + '.part'
    # stream=True keeps the response body out of memory; the timeout stops
    # the script from hanging forever on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as req:
        # Without this an HTML error page would be saved as the CSV.
        req.raise_for_status()
        with open(partial_path, 'wb') as fd:
            for chunk in req.iter_content(4096):
                fd.write(chunk)
    os.replace(partial_path, path)


def read_top_domains(path, limit=10000):
    """Return the sorted, de-duplicated domains of the first *limit* rows.

    The domain is taken from the third CSV column. A first row whose
    first column is not a number is treated as the column-name header
    and skipped, so it neither ends up in the result nor uses up one of
    the *limit* slots. Rows with fewer than three columns (e.g. blank
    lines) are ignored instead of raising ``IndexError``.
    """
    domains = []
    # newline='' is what the csv module expects from a text file object.
    with open(path, newline='', encoding='utf-8', errors='replace') as csv_file:
        # Rows are consumed lazily, so only the needed prefix of the file
        # is read rather than every line.
        for index, row in enumerate(csv.reader(csv_file)):
            if len(domains) >= limit:
                break
            if len(row) < 3:
                continue
            if index == 0 and not row[0].strip().isdigit():
                continue
            domain = row[2].strip()
            if domain:
                domains.append(domain)
    return sorted(set(domains))


def main(source=csv_path, destination=hostname_path):
    """Write the top-10K host list read from *source* to *destination* as JSON.

    *source* is downloaded from ``servers_url`` first unless a local copy
    already exists, in which case the copy is reused and a warning is
    logged.
    """
    if os.path.isfile(source):
        logging.warning('Not erasing local csv file')
    else:
        download_csv(servers_url, source)

    # Key order is kept as-is because the file is dumped with
    # sort_keys=False.
    out_list = {
        'name': 'Top 10K websites from Majestic Million',
        'version': int(datetime.date.today().strftime('%Y%m%d')),
        'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).',
        'matching_attributes': ['hostname', 'domain'],
        'type': 'hostname',
        'list': read_top_domains(source),
    }
    with open(destination, 'w', newline='\n') as hostname_file:
        json.dump(out_list, hostname_file, indent=2, sort_keys=False)


if __name__ == '__main__':
    main()