2020-02-03 08:01:22 +01:00
|
|
|
#!/usr/bin/env python3
|
2020-07-17 10:06:06 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
2020-02-03 08:01:22 +01:00
|
|
|
|
2020-07-27 10:44:30 +02:00
|
|
|
from generator import download_to_file, get_version, write_to_file, get_abspath_source_file
|
2022-10-10 15:51:11 +02:00
|
|
|
import argparse
|
2020-07-17 10:06:06 +02:00
|
|
|
|
|
|
|
|
2022-10-10 15:51:11 +02:00
|
|
|
def process(file, dst, numbers):
    """Build a MISP warninglist of the top *numbers* Majestic Million sites.

    Reads the previously downloaded Majestic Million CSV, extracts the
    domain column of the first *numbers* ranked entries, and writes the
    resulting warninglist via ``write_to_file``.

    :param file: filename of the downloaded CSV (resolved with
        ``get_abspath_source_file``).
    :param dst: destination warninglist name passed to ``write_to_file``.
    :param numbers: how many top-ranked sites to include.
    """
    with open(get_abspath_source_file(file), newline='\n', encoding='utf-8', errors='replace') as csv_file:
        # Index 0 is the CSV header row; take the next `numbers` data rows.
        # (The previous slice [1:numbers] returned only numbers-1 entries.)
        sites = csv_file.readlines()[1:numbers + 1]

    warninglist = {
        'name': f'Top {numbers} websites from Majestic Million',
        'version': get_version(),
        # Keep the description consistent with the parameterized count
        # instead of the previously hard-coded "top 10K".
        'description': f'Event contains one or more entries from the top {numbers} of the most used websites (Majestic Million).',
        'matching_attributes': ['hostname', 'domain'],
        'type': 'string',
        # Column 2 of the Majestic CSV is the domain; strip the trailing newline.
        'list': [site.split(',')[2].rstrip() for site in sites],
    }

    write_to_file(warninglist, dst)
|
2020-07-17 10:06:06 +02:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate the Majestic Million warninglist.')
    # type=int validates the value at parse time (clear argparse error on
    # non-numeric input) instead of deferring to an int() call later.
    parser.add_argument("-n", type=int, help="number of websites to process", required=True)
    args = parser.parse_args()

    # NOTE(review): source is served over plain HTTP — confirm whether an
    # HTTPS endpoint is available before switching.
    majestic_url = 'http://downloads.majestic.com/majestic_million.csv'
    majestic_file = 'majestic_million.csv'
    majestic_dst = 'majestic_million'

    download_to_file(majestic_url, majestic_file)
    # args.n is already an int thanks to type=int above.
    process(majestic_file, majestic_dst, args.n)
|