chg: [majestic_million] numbers parameter

2022-10-10 15:51:11 +02:00 · 2022-10-10 15:51:11 +02:00 · 49922d0635
parent 88d388a01e
commit 49922d0635
2 changed files with 11 additions and 5 deletions
--- a/generate_all.sh
+++ b/generate_all.sh
@@ -14,7 +14,7 @@ python3 generate-disposal.py
 # TODO: Google page on Wikipedia does not exist anymore
 # Suggestion came to use a passivetotal whois search for org:Google LLC
 #python3 generate-google.py > lists/google/list.json
-python3 generate_majestic-million.py
+python3 generate_majestic-million.py -n 10000
 python3 generate-microsoft-azure.py
 python3 generate_mozilla_certificates.py
 python3 generate_moz-top500.py
--- a/tools/generate_majestic-million.py
+++ b/tools/generate_majestic-million.py
@@ -2,15 +2,16 @@
 # -*- coding: utf-8 -*-

 from generator import download_to_file, get_version, write_to_file, get_abspath_source_file
+import argparse


-def process(file, dst):
+def process(file, dst, numbers):

    with open(get_abspath_source_file(file), newline='\n', encoding='utf-8', errors='replace') as csv_file:
-        sites = csv_file.readlines()[:10000]
+        sites = csv_file.readlines()[:numbers]

    warninglist = {
-        'name': 'Top 10K websites from Majestic Million',
+        'name': f'Top {numbers} websites from Majestic Million',
        'version': get_version(),
        'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).',
        'matching_attributes': ['hostname', 'domain'],
@@ -26,9 +27,14 @@ def process(file, dst):


 if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-n", help="number of website to process", required=True)
+    args = parser.parse_args()
+
    majestic_url = 'http://downloads.majestic.com/majestic_million.csv'
    majestic_file = 'majestic_million.csv'
    majestic_dst = 'majestic_million'

    download_to_file(majestic_url, majestic_file)
-    process(majestic_file, majestic_dst)
+    process(majestic_file, majestic_dst, int(args.n))