misp-warninglists/tools/generate_majestic-million.py

40 lines
1.2 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
2020-07-17 12:42:34 +02:00
from generator import download_to_file, get_abspath_list_file, get_version
def process(file, dst):
    """Build the "Top 10K websites from Majestic Million" warninglist file.

    Reads the comma-separated ranking in *file*, collects the value of the
    third column (index 2) from the first 10,000 data rows, de-duplicates
    and sorts those values, and writes the resulting warninglist as JSON.

    Args:
        file: Path of the CSV file to read.
        dst: Name passed to ``get_abspath_list_file`` to obtain the path
            of the JSON file that is written.

    Notes:
        * A leading column-header row (first field not numeric) is skipped
          so that the header text is neither emitted as a hostname nor
          counted against the 10,000-row limit.
        * Blank or truncated lines (fewer than three fields) and rows with
          an empty domain are ignored instead of raising ``IndexError``.
        * The file is streamed and reading stops once enough rows have
          been collected, so the full ranking is never held in memory.
    """
    max_rows = 10000
    domains = set()
    rows_taken = 0
    first_row = True

    with open(file, newline='\n', encoding='utf-8', errors='replace') as csv_file:
        for line in csv_file:
            if rows_taken >= max_rows:
                break

            fields = line.split(',')
            if len(fields) < 3:
                # Blank or truncated line: there is no domain column to read.
                continue

            if first_row:
                first_row = False
                # A data row starts with a numeric rank; anything else in
                # the first usable row is the column header. A possible
                # UTF-8 byte-order mark is ignored for this check.
                if not fields[0].lstrip('\ufeff').strip().isdigit():
                    continue

            domain = fields[2].strip()
            if not domain:
                continue

            domains.add(domain)
            rows_taken += 1

    warninglist = {
        'name': 'Top 10K websites from Majestic Million',
        'version': get_version(),
        'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).',
        'matching_attributes': ['hostname', 'domain'],
        'type': 'hostname',
        'list': sorted(domains)
    }

    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        # Terminate the file with a newline after the JSON document.
        data_file.write("\n")
if __name__ == '__main__':
    # Script entry point: the remote location of the ranking, the local
    # file name handed to download_to_file(), and the name handed to
    # process() for the generated list.
    source_url = 'http://downloads.majestic.com/majestic_million.csv'
    local_csv = 'majestic_million.csv'
    list_name = 'majestic_million'

    # Download first, then convert the downloaded CSV into the warninglist.
    download_to_file(source_url, local_csv)
    process(local_csv, list_name)