add .gitignore for downloaded files, refactor code for generators: use central module, remove useless code, fix minor issues

pull/154/head
Kevin Holvoet 2020-07-17 10:06:06 +02:00
parent c00b21de5f
commit 049475ab22
9 changed files with 195 additions and 198 deletions

18
.gitignore vendored Normal file
View File

@@ -0,0 +1,18 @@
# files downloaded by generators
alexa_top-1m.csv.zip
amazon_ip-ranges.json
cisco_top-1m.csv.zip
cloudflare_ips-v4.txt
cloudflare_ips-v6.txt
IncludedCACertificateReportPEMCSV.csv
majestic_million.csv
ocsp_crl-hostnames.txt.txt
ocsp_crl-ipv4.txt.txt
ocsp_crl-ipv6.txt.txt
ocsp_ocsp-hostnames.txt.txt
ocsp_ocsp-ipv4.txt.txt
ocsp_ocsp-ipv6.txt.txt
PublicAllIntermediateCertsWithPEMCSV.csv
top500.domains.csv
top500.pages.csv
top-1m.csv.zip

View File

@@ -1,29 +1,39 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import datetime
import urllib.request
import json
res = urllib.request.urlopen('https://ip-ranges.amazonaws.com/ip-ranges.json')
from generator import download, download_to_file, get_abspath_list_file, get_version
res_body = res.read()
j = json.loads(res_body.decode("utf-8"))
l = []
for prefix in j['prefixes']:
l.append(prefix['ip_prefix'])
def process(file, dst):
    """Build the Amazon AWS IP ranges warninglist from a downloaded ip-ranges.json.

    :param file: path to the previously downloaded ip-ranges.json file
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    with open(file, 'r') as json_file:
        amazon_aws_ip_list = json.load(json_file)
    ranges = []
    # Collect both the IPv4 ('prefixes') and IPv6 ('ipv6_prefixes') sections.
    for prefix in amazon_aws_ip_list['prefixes']:
        ranges.append(prefix['ip_prefix'])
    for prefix in amazon_aws_ip_list['ipv6_prefixes']:
        ranges.append(prefix['ipv6_prefix'])
    warninglist = {}
    warninglist['name'] = 'List of known Amazon AWS IP address ranges'
    warninglist['version'] = get_version()
    warninglist['description'] = 'Amazon AWS IP address ranges (https://ip-ranges.amazonaws.com/ip-ranges.json)'
    warninglist['type'] = 'cidr'
    # De-duplicate and sort for a stable, diff-friendly list.
    warninglist['list'] = sorted(set(ranges))
    warninglist['matching_attributes'] = ["ip-src", "ip-dst", "domain|ip"]
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write('\n')


if __name__ == '__main__':
    amazon_url = "https://ip-ranges.amazonaws.com/ip-ranges.json"
    amazon_file = "amazon_ip-ranges.json"
    amazon_dst = "amazon-aws"
    download_to_file(amazon_url, amazon_file)
    process(amazon_file, amazon_dst)

View File

@@ -1,40 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import zipfile
import datetime
import json
cisco_url = "http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"
cisco_file = "top-1m.csv.zip"
user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
r = requests.get(cisco_url, headers=user_agent)
with open(cisco_file, 'wb') as fd:
for chunk in r.iter_content(4096):
fd.write(chunk)
with zipfile.ZipFile(cisco_file, 'r') as cisco_lists:
for name in cisco_lists.namelist():
if name == "top-1m.csv":
with cisco_lists.open(name) as top:
top1000 = top.readlines()[:1000]
else:
continue
cisco_warninglist = {}
version = int(datetime.date.today().strftime('%Y%m%d'))
cisco_warninglist['description'] = 'Event contains one or more entries from the top 1000 of the most used website (Cisco Umbrella).'
d = datetime.datetime.now()
cisco_warninglist['version'] = version
cisco_warninglist['name'] = 'Top 1000 website from Cisco Umbrella'
cisco_warninglist['type'] = 'hostname'
cisco_warninglist['matching_attributes'] = ['hostname', 'domain']
cisco_warninglist['list'] = []
from generator import download, download_to_file, get_abspath_list_file, get_version
for site in top1000:
v = str(site).split(',')[1]
cisco_warninglist['list'].append(v.rstrip())
cisco_warninglist['list'] = sorted(set(cisco_warninglist['list']))
print(json.dumps(cisco_warninglist))
def process(file, dst):
    """Build the Cisco Umbrella top-1000 warninglist from the downloaded zip.

    :param file: path to the previously downloaded top-1m.csv.zip archive
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    # Default to an empty list so a zip without top-1m.csv does not leave
    # top1000 unbound (previously a NameError).
    top1000 = []
    with zipfile.ZipFile(file, 'r') as cisco_lists:
        for name in cisco_lists.namelist():
            if name == "top-1m.csv":
                with cisco_lists.open(name) as top:
                    # Only the first 1000 entries are kept.
                    top1000 = top.readlines()[:1000]
                break
    warninglist = {
        'description': 'Event contains one or more entries from the top 1000 of the most used website (Cisco Umbrella).',
        'version': get_version(),
        'name': 'Top 1000 website from Cisco Umbrella',
        'type': 'hostname',
        'matching_attributes': ['hostname', 'domain', 'url', 'domain|ip'],
        'list': []
    }
    for site in top1000:
        # Rows look like b"rank,domain\r\n"; keep the domain column.
        # strip() already removes the trailing CR/LF, so the old
        # replace('\\r\\n', '') on literal backslash text was dead code.
        v = site.decode('UTF-8').split(',')[1]
        warninglist['list'].append(v.strip())
    warninglist['list'] = sorted(set(warninglist['list']))
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write("\n")


if __name__ == '__main__':
    cisco_url = "http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"
    cisco_file = "cisco_top-1m.csv.zip"
    cisco_dst = 'cisco_top1000'
    download_to_file(cisco_url, cisco_file)
    process(cisco_file, cisco_dst)

View File

@@ -1,33 +1,42 @@
#!/usr/bin/env python3
import json
import os
import requests
import datetime
import io
base_url="https://www.cloudflare.com/"
uri_list=['ips-v4','ips-v6']
dict=dict()
dict['list']=list()
def source_read_and_add(input_file):
output_list=list()
for line in input_file.splitlines():
output_list.append(line)
return output_list
from generator import download, download_to_file, get_abspath_list_file, get_version
for uri in uri_list:
url = base_url + uri
r=requests.get(url)
dict['list'] += source_read_and_add(r.text)
def process(files, dst):
    """Aggregate the downloaded Cloudflare range files into one warninglist.

    :param files: paths of the downloaded ips-v4 / ips-v6 text files
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    cidrs = []
    for path in files:
        with open(path, 'r') as handle:
            # One CIDR range per line; strip trailing newlines.
            cidrs.extend(entry.strip() for entry in handle.readlines())
    warninglist = {
        'name': "List of known Cloudflare IP ranges",
        'version': get_version(),
        'description': "List of known Cloudflare IP ranges (https://www.cloudflare.com/ips/)",
        'type': "cidr",
        'list': sorted(set(cidrs)),
        'matching_attributes': ["ip-dst", "ip-src", "domain|ip"],
    }
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write("\n")
dict['type'] = "cidr"
dict['matching_attributes']=["ip-dst","ip-src","domain|ip"]
dict['name']="List of known Cloudflare IP ranges"
dict['version']= int(datetime.date.today().strftime('%Y%m%d'))
dict['description']="List of known Cloudflare IP ranges (https://www.cloudflare.com/ips/)"
dict['list']=list(set(dict['list']))
print(json.dumps(dict))
if __name__ == '__main__':
    # Download each published range file, then merge them into one list.
    cf_base_url = "https://www.cloudflare.com/"
    uri_list = ['ips-v4', 'ips-v6']
    cf_dst = 'cloudflare'
    to_process = []
    for uri in uri_list:
        target = 'cloudflare_{}.txt'.format(uri)
        download_to_file(cf_base_url + uri, target)
        to_process.append(target)
    process(to_process, cf_dst)

View File

@@ -1,43 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import json
import datetime
url = 'https://raw.githubusercontent.com/krassi/covid19-related/master/whitelist-domains.txt'
r = requests.get(url)
whitelist = r.text
whitelist = list(set(whitelist.split()))
warninglist = {
'name': 'Covid-19 Krassi\'s Whitelist',
'description': 'Krassimir\'s Covid-19 whitelist of known good Covid-19 related websites.',
'type': 'hostname',
'matching_attributes': ['domain', 'hostname', 'url'],
'version': int(datetime.date.today().strftime('%Y%m%d')),
'list': sorted(whitelist)
}
with open('../lists/covid-19-krassi-whitelist/list.json', 'w+') as data_file:
json.dump(warninglist, data_file, indent=2, sort_keys=True)
url = 'https://raw.githubusercontent.com/Cyber-Threat-Coalition/goodlist/master/hostnames.txt'
r = requests.get(url)
whitelist = r.text
whitelist = list(set(whitelist.split()))
warninglist = {
'name': 'Covid-19 Cyber Threat Coalition\'s Whitelist',
'description': 'The Cyber Threat Coalition\'s whitelist of COVID-19 related websites.',
'type': 'hostname',
'matching_attributes': ['domain', 'hostname', 'url'],
'version': int(datetime.date.today().strftime('%Y%m%d')),
'list': sorted(whitelist)
}
with open('../lists/covid-19-cyber-threat-coalition-whitelist/list.json', 'w+') as data_file:
json.dump(warninglist, data_file, indent=2, sort_keys=True)
from generator import download, download_to_file, get_abspath_list_file, get_version
def process(url, warninglist, dst):
    """Fetch a whitelist, complete the warninglist metadata and write it out.

    :param url: URL of a whitespace-separated hostname whitelist
    :param warninglist: partially filled warninglist dict (name/description);
        completed in place with type, matching_attributes, version and list
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    entries = download(url).text.split()
    warninglist['type'] = 'hostname'
    warninglist['matching_attributes'] = ['domain', 'hostname', 'url']
    warninglist['version'] = get_version()
    # De-duplicate, then sort for a stable output.
    warninglist['list'] = sorted(set(entries))
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write('\n')
if __name__ == '__main__':
    # (url, partial warninglist, destination) triples, processed in order.
    sources = [
        (
            'https://raw.githubusercontent.com/krassi/covid19-related/master/whitelist-domains.txt',
            {
                'name': 'Covid-19 Krassi\'s Whitelist',
                'description': 'Krassimir\'s Covid-19 whitelist of known good Covid-19 related websites.'
            },
            'covid-19-krassi-whitelist',
        ),
        (
            'https://raw.githubusercontent.com/Cyber-Threat-Coalition/goodlist/master/hostnames.txt',
            {
                'name': 'Covid-19 Cyber Threat Coalition\'s Whitelist',
                'description': 'The Cyber Threat Coalition\'s whitelist of COVID-19 related websites.'
            },
            'covid-19-cyber-threat-coalition-whitelist',
        ),
    ]
    for src_url, src_warninglist, src_dst in sources:
        process(src_url, src_warninglist, src_dst)

View File

@@ -1,30 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import json
from os import path
import zipfile
from inspect import currentframe, getframeinfo
import requests
def download(url, file):
user_agent = {
"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
r = requests.get(url, headers=user_agent)
with open(file, 'wb') as fd:
for chunk in r.iter_content(4096):
fd.write(chunk)
def get_abspath_list_file(dst):
rel_path = getframeinfo(currentframe()).filename
current_folder = path.dirname(path.abspath(rel_path))
real_path = path.join(
current_folder, '../lists/{dst}/list.json'.format(dst=dst))
return path.abspath(path.realpath(real_path))
from generator import download, download_to_file, get_abspath_list_file, get_version
def process(file, dst):
@@ -36,30 +16,29 @@ def process(file, dst):
else:
continue
alexa_warninglist = {}
alexa_warninglist[
'description'] = "Event contains one or more entries from the top 1000 of the most used website (Alexa)."
alexa_warninglist['version'] = int(
datetime.date.today().strftime('%Y%m%d'))
alexa_warninglist['name'] = "Top 1000 website from Alexa"
alexa_warninglist['type'] = 'hostname'
alexa_warninglist['list'] = []
alexa_warninglist['matching_attributes'] = ['hostname', 'domain', 'url', 'domain|ip']
warninglist = {
'description': "Event contains one or more entries from the top 1000 of the most used website (Alexa).",
'version': get_version(),
'name': "Top 1000 website from Alexa",
'type': 'hostname',
'list': [],
'matching_attributes': ['hostname', 'domain', 'url', 'domain|ip']
}
for site in top1000:
v = site.decode('UTF-8').split(',')[1]
alexa_warninglist['list'].append(v.rstrip())
alexa_warninglist['list'] = sorted(set(alexa_warninglist['list']))
warninglist['list'].append(v.rstrip())
warninglist['list'] = sorted(set(warninglist['list']))
with open(get_abspath_list_file(dst), 'w') as data_file:
json.dump(alexa_warninglist, data_file, indent=2, sort_keys=True)
json.dump(warninglist, data_file, indent=2, sort_keys=True)
data_file.write("\n")
if __name__ == "__main__":
alexa_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
alexa_file = "top-1m.csv.zip"
alexa_file = "alexa_top-1m.csv.zip"
alexa_dst = "alexa"
download(alexa_url, alexa_file)
download_to_file(alexa_url, alexa_file)
process(alexa_file, alexa_dst)

View File

@@ -1,40 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import logging
import json
import os
import requests
from generator import download, download_to_file, get_abspath_list_file, get_version
servers_url = 'http://downloads.majestic.com/majestic_million.csv'
csv_path = 'majestic_million.csv'
hostname_path = 'list.json'
if os.path.isfile(csv_path):
logging.warning('Not erasing local csv file')
else:
req = requests.get(servers_url)
with open(csv_path, 'wb') as fd:
for chunk in req.iter_content(4096):
fd.write(chunk)
def process(file, dst):
    """Build the Majestic Million top-10K warninglist from the downloaded CSV.

    :param file: path to the previously downloaded majestic_million.csv
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    # errors='replace' guards against stray non-UTF-8 bytes in the feed.
    with open(file, newline='\n', encoding='utf-8', errors='replace') as csv_file:
        # Only the first 10K entries are kept.
        sites = csv_file.readlines()[:10000]
    warninglist = {
        'name': 'Top 10K websites from Majestic Million',
        'version': get_version(),
        'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).',
        'matching_attributes': ['hostname', 'domain'],
        'type': 'hostname',
        'list': []
    }
    for site in sites:
        # Column 2 of the Majestic CSV holds the domain name.
        v = site.split(',')[2]
        warninglist['list'].append(v.rstrip())
    warninglist['list'] = sorted(set(warninglist['list']))
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write("\n")


if __name__ == '__main__':
    majestic_url = 'http://downloads.majestic.com/majestic_million.csv'
    majestic_file = 'majestic_million.csv'
    majestic_dst = 'majestic_million'
    download_to_file(majestic_url, majestic_file)
    process(majestic_file, majestic_dst)

View File

@@ -4,32 +4,16 @@
import csv
import datetime
import json
from inspect import currentframe, getframeinfo
from os import path
import requests
from OpenSSL.crypto import FILETYPE_PEM, load_certificate
def download(url, file):
r = requests.get(url)
with open(file, 'wb') as fd:
for chunk in r.iter_content(4096):
fd.write(chunk)
from generator import download, download_to_file, get_abspath_list_file, get_version
def gethash(cert, digest):
return cert.digest(digest).decode('ASCII').replace(':', '').lower()
def get_abspath_list_file(dst):
rel_path = getframeinfo(currentframe()).filename
current_folder = path.dirname(path.abspath(rel_path))
real_path = path.join(
current_folder, '../lists/{dst}/list.json'.format(dst=dst))
return path.abspath(path.realpath(real_path))
def process(file, dst, type):
hashes = set()
with open(file, 'r') as f_in:
@@ -68,8 +52,8 @@ if __name__ == '__main__':
CA_known_intermediate_file = 'PublicAllIntermediateCertsWithPEMCSV.csv'
CA_known_intermediate_dst = 'mozilla-IntermediateCA'
download(Included_CA_url, Included_CA_file)
download_to_file(Included_CA_url, Included_CA_file)
process(Included_CA_file, Included_CA_dst, 'trusted CA certificates')
download(CA_known_intermediate_url, CA_known_intermediate_file)
download_to_file(CA_known_intermediate_url, CA_known_intermediate_file)
process(CA_known_intermediate_file, CA_known_intermediate_dst,
'known intermedicate of trusted certificates')

View File

@@ -36,7 +36,7 @@ def process(file, warninglist, dst, first_10k=False):
if __name__ == '__main__':
tranco_url = 'https://tranco-list.eu/top-1m.csv.zip'
tranco_file = 'top-1m.csv.zip'
tranco_file = 'tranco_top-1m.csv.zip'
download_to_file(tranco_url, tranco_file)