diff --git a/generate_all.sh b/generate_all.sh index e2b74e7..1fe3fe3 100755 --- a/generate_all.sh +++ b/generate_all.sh @@ -14,7 +14,7 @@ python3 generate-disposal.py # TODO: Google page on Wikipedia does not exist anymore # Suggestion came to use a passivetotal whois search for org:Google LLC #python3 generate-google.py > lists/google/list.json -python3 generate_majestic-million.py +python3 generate_majestic-million.py -n 10000 python3 generate-microsoft-azure.py python3 generate_mozilla_certificates.py python3 generate_moz-top500.py diff --git a/tools/generate_majestic-million.py b/tools/generate_majestic-million.py index 9bb6506..ffd25a4 100755 --- a/tools/generate_majestic-million.py +++ b/tools/generate_majestic-million.py @@ -2,15 +2,17 @@ # -*- coding: utf-8 -*- from generator import download_to_file, get_version, write_to_file, get_abspath_source_file +import argparse -def process(file, dst): +def process(file, dst, numbers=10000): + """Build the Majestic Million warninglist from the first `numbers` lines of `file`.""" with open(get_abspath_source_file(file), newline='\n', encoding='utf-8', errors='replace') as csv_file: - sites = csv_file.readlines()[:10000] + sites = csv_file.readlines()[:numbers] warninglist = { - 'name': 'Top 10K websites from Majestic Million', + 'name': f'Top {numbers} websites from Majestic Million', 'version': get_version(), - 'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).', + 'description': f'Event contains one or more entries from the top {numbers} of the most used websites (Majestic Million).', 'matching_attributes': ['hostname', 'domain'], @@ -26,9 +28,15 @@ def process(file, dst): if __name__ == '__main__': + + parser = argparse.ArgumentParser() + # -n is optional: omitting it keeps the previous top-10K behaviour. + parser.add_argument("-n", type=int, default=10000, help="number of websites to process (default: 10000)") + args = parser.parse_args() + majestic_url = 'http://downloads.majestic.com/majestic_million.csv' majestic_file = 'majestic_million.csv' majestic_dst = 'majestic_million' download_to_file(majestic_url, majestic_file) - process(majestic_file, majestic_dst) + process(majestic_file, majestic_dst, args.n)