diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e0de077 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +# files downloaded by generators +alexa_top-1m.csv.zip +amazon_ip-ranges.json +cisco_top-1m.csv.zip +cloudflare_ips-v4.txt +cloudflare_ips-v6.txt +IncludedCACertificateReportPEMCSV.csv +majestic_million.csv +ocsp_crl-hostnames.txt.txt +ocsp_crl-ipv4.txt.txt +ocsp_crl-ipv6.txt.txt +ocsp_ocsp-hostnames.txt.txt +ocsp_ocsp-ipv4.txt.txt +ocsp_ocsp-ipv6.txt.txt +PublicAllIntermediateCertsWithPEMCSV.csv +top500.domains.csv +top500.pages.csv +top-1m.csv.zip \ No newline at end of file diff --git a/tools/generate-amazon-aws.py b/tools/generate-amazon-aws.py index 4d2c39b..fa2b835 100755 --- a/tools/generate-amazon-aws.py +++ b/tools/generate-amazon-aws.py @@ -1,29 +1,39 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import json -import datetime -import urllib.request import json -res = urllib.request.urlopen('https://ip-ranges.amazonaws.com/ip-ranges.json') +from generator import download, download_to_file, get_abspath_list_file, get_version -res_body = res.read() -j = json.loads(res_body.decode("utf-8")) -l = [] -for prefix in j['prefixes']: - l.append(prefix['ip_prefix']) +def process(file, dst): + with open(file, 'r') as json_file: + amazon_aws_ip_list = json.load(json_file) + l = [] -for prefix in j['ipv6_prefixes']: - l.append(prefix['ipv6_prefix']) - -warninglist = {} -warninglist['name'] = 'List of known Amazon AWS IP address ranges' -warninglist['version'] = int(datetime.date.today().strftime('%Y%m%d')) -warninglist['description'] = 'Amazon AWS IP address ranges (https://ip-ranges.amazonaws.com/ip-ranges.json)' -warninglist['type'] = 'cidr' -warninglist['list'] = sorted(set(l)) -warninglist['matching_attributes'] = ["ip-src", "ip-dst", "domain|ip"] + for prefix in amazon_aws_ip_list['prefixes']: + l.append(prefix['ip_prefix']) -print(json.dumps(warninglist)) + for prefix in amazon_aws_ip_list['ipv6_prefixes']: + 
l.append(prefix['ipv6_prefix']) + + warninglist = {} + warninglist['name'] = 'List of known Amazon AWS IP address ranges' + warninglist['version'] = get_version() + warninglist['description'] = 'Amazon AWS IP address ranges (https://ip-ranges.amazonaws.com/ip-ranges.json)' + warninglist['type'] = 'cidr' + warninglist['list'] = sorted(set(l)) + warninglist['matching_attributes'] = ["ip-src", "ip-dst", "domain|ip"] + + with open(get_abspath_list_file(dst), 'w') as data_file: + json.dump(warninglist, data_file, indent=2, sort_keys=True) + data_file.write('\n') + + +if __name__ == '__main__': + amazon_url = "https://ip-ranges.amazonaws.com/ip-ranges.json" + amazon_file = "amazon_ip-ranges.json" + amazon_dst = "amazon-aws" + + download_to_file(amazon_url, amazon_file) + process(amazon_file, amazon_dst) diff --git a/tools/generate-cisco-top1k.py b/tools/generate-cisco-top1k.py index a74204d..95581e8 100755 --- a/tools/generate-cisco-top1k.py +++ b/tools/generate-cisco-top1k.py @@ -1,40 +1,44 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import requests import zipfile -import datetime import json -cisco_url = "http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -cisco_file = "top-1m.csv.zip" -user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"} -r = requests.get(cisco_url, headers=user_agent) -with open(cisco_file, 'wb') as fd: - for chunk in r.iter_content(4096): - fd.write(chunk) -with zipfile.ZipFile(cisco_file, 'r') as cisco_lists: - for name in cisco_lists.namelist(): - if name == "top-1m.csv": - with cisco_lists.open(name) as top: - top1000 = top.readlines()[:1000] - else: - continue - -cisco_warninglist = {} -version = int(datetime.date.today().strftime('%Y%m%d')) - -cisco_warninglist['description'] = 'Event contains one or more entries from the top 1000 of the most used website (Cisco Umbrella).' 
-d = datetime.datetime.now() -cisco_warninglist['version'] = version -cisco_warninglist['name'] = 'Top 1000 website from Cisco Umbrella' -cisco_warninglist['type'] = 'hostname' -cisco_warninglist['matching_attributes'] = ['hostname', 'domain'] -cisco_warninglist['list'] = [] +from generator import download, download_to_file, get_abspath_list_file, get_version -for site in top1000: - v = str(site).split(',')[1] - cisco_warninglist['list'].append(v.rstrip()) -cisco_warninglist['list'] = sorted(set(cisco_warninglist['list'])) -print(json.dumps(cisco_warninglist)) +def process(file, dst): + with zipfile.ZipFile(file, 'r') as cisco_lists: + for name in cisco_lists.namelist(): + if name == "top-1m.csv": + with cisco_lists.open(name) as top: + top1000 = top.readlines()[:1000] + else: + continue + + warninglist = { + 'description': 'Event contains one or more entries from the top 1000 of the most used website (Cisco Umbrella).', + 'version': get_version(), + 'name': 'Top 1000 website from Cisco Umbrella', + 'type': 'hostname', + 'matching_attributes': ['hostname', 'domain', 'url', 'domain|ip'], + 'list': [] + } + + for site in top1000: + v = site.decode('UTF-8').split(',')[1] + warninglist['list'].append(v.strip().replace('\\r\\n','')) + warninglist['list'] = sorted(set(warninglist['list'])) + + with open(get_abspath_list_file(dst), 'w') as data_file: + json.dump(warninglist, data_file, indent=2, sort_keys=True) + data_file.write("\n") + + +if __name__ == '__main__': + cisco_url = "http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" + cisco_file = "cisco_top-1m.csv.zip" + cisco_dst = 'cisco_top1000' + + download_to_file(cisco_url, cisco_file) + process(cisco_file, cisco_dst) \ No newline at end of file diff --git a/tools/generate-cloudflare.py b/tools/generate-cloudflare.py index 155af9f..3f5a685 100755 --- a/tools/generate-cloudflare.py +++ b/tools/generate-cloudflare.py @@ -1,33 +1,42 @@ #!/usr/bin/env python3 import json -import os -import requests -import 
datetime -import io -base_url="https://www.cloudflare.com/" -uri_list=['ips-v4','ips-v6'] -dict=dict() -dict['list']=list() -def source_read_and_add(input_file): - output_list=list() - - for line in input_file.splitlines(): - output_list.append(line) - return output_list +from generator import download, download_to_file, get_abspath_list_file, get_version -for uri in uri_list: - url = base_url + uri - r=requests.get(url) - dict['list'] += source_read_and_add(r.text) +def process(files, dst): + warninglist = {} + warninglist['name'] = "List of known Cloudflare IP ranges" + warninglist['version'] = get_version() + warninglist['description'] = "List of known Cloudflare IP ranges (https://www.cloudflare.com/ips/)" + warninglist['type'] = "cidr" + warninglist['list'] = [] + warninglist['matching_attributes'] = ["ip-dst","ip-src","domain|ip"] + + for file in files: + with open(file, 'r') as f: + ips = f.readlines() + for ip in ips: + warninglist['list'].append(ip.strip()) + warninglist['list'] = sorted(set(warninglist['list'])) + + with open(get_abspath_list_file(dst), 'w') as data_file: + json.dump(warninglist, data_file, indent=2, sort_keys=True) + data_file.write("\n") -dict['type'] = "cidr" -dict['matching_attributes']=["ip-dst","ip-src","domain|ip"] -dict['name']="List of known Cloudflare IP ranges" -dict['version']= int(datetime.date.today().strftime('%Y%m%d')) -dict['description']="List of known Cloudflare IP ranges (https://www.cloudflare.com/ips/)" -dict['list']=list(set(dict['list'])) -print(json.dumps(dict)) +if __name__ == '__main__': + cf_base_url = "https://www.cloudflare.com/" + uri_list = ['ips-v4','ips-v6'] + cf_dst = 'cloudflare' + + to_process = list() + + for uri in uri_list: + url = cf_base_url+uri + file = 'cloudflare_{}.txt'.format(uri) + download_to_file(url, file) + to_process.append(file) + + process(to_process, cf_dst) diff --git a/tools/generate-covid.py b/tools/generate-covid.py index 9044615..9f6f9b0 100755 --- a/tools/generate-covid.py +++ 
b/tools/generate-covid.py @@ -1,43 +1,38 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import requests import json -import datetime -url = 'https://raw.githubusercontent.com/krassi/covid19-related/master/whitelist-domains.txt' -r = requests.get(url) -whitelist = r.text -whitelist = list(set(whitelist.split())) - -warninglist = { - 'name': 'Covid-19 Krassi\'s Whitelist', - 'description': 'Krassimir\'s Covid-19 whitelist of known good Covid-19 related websites.', - 'type': 'hostname', - 'matching_attributes': ['domain', 'hostname', 'url'], - 'version': int(datetime.date.today().strftime('%Y%m%d')), - 'list': sorted(whitelist) -} - -with open('../lists/covid-19-krassi-whitelist/list.json', 'w+') as data_file: - json.dump(warninglist, data_file, indent=2, sort_keys=True) - -url = 'https://raw.githubusercontent.com/Cyber-Threat-Coalition/goodlist/master/hostnames.txt' -r = requests.get(url) -whitelist = r.text -whitelist = list(set(whitelist.split())) - -warninglist = { - 'name': 'Covid-19 Cyber Threat Coalition\'s Whitelist', - 'description': 'The Cyber Threat Coalition\'s whitelist of COVID-19 related websites.', - 'type': 'hostname', - 'matching_attributes': ['domain', 'hostname', 'url'], - 'version': int(datetime.date.today().strftime('%Y%m%d')), - 'list': sorted(whitelist) -} - -with open('../lists/covid-19-cyber-threat-coalition-whitelist/list.json', 'w+') as data_file: - json.dump(warninglist, data_file, indent=2, sort_keys=True) +from generator import download, download_to_file, get_abspath_list_file, get_version +def process(url, warninglist, dst): + whitelist = download(url).text + whitelist = list(set(whitelist.split())) + warninglist['type'] = 'hostname' + warninglist['matching_attributes'] = ['domain', 'hostname', 'url'] + warninglist['version'] = get_version() + warninglist['list'] = sorted(whitelist) + + with open(get_abspath_list_file(dst), 'w') as data_file: + json.dump(warninglist, data_file, indent=2, sort_keys=True) + data_file.write('\n') + + 
+if __name__ == '__main__': + covid_krassi_url = 'https://raw.githubusercontent.com/krassi/covid19-related/master/whitelist-domains.txt' + covid_krassi_dst = 'covid-19-krassi-whitelist' + covid_krassi_warninglist = { + 'name': 'Covid-19 Krassi\'s Whitelist', + 'description': 'Krassimir\'s Covid-19 whitelist of known good Covid-19 related websites.' + } + covid_cyber_threat_coalition_url = 'https://raw.githubusercontent.com/Cyber-Threat-Coalition/goodlist/master/hostnames.txt' + covid_cyber_threat_coalition_dst = 'covid-19-cyber-threat-coalition-whitelist' + covid_cyber_threat_coalition_warninglist = { + 'name': 'Covid-19 Cyber Threat Coalition\'s Whitelist', + 'description': 'The Cyber Threat Coalition\'s whitelist of COVID-19 related websites.' + } + + process(covid_krassi_url, covid_krassi_warninglist, covid_krassi_dst) + process(covid_cyber_threat_coalition_url, covid_cyber_threat_coalition_warninglist, covid_cyber_threat_coalition_dst) diff --git a/tools/generate_alexa.py b/tools/generate_alexa.py index 91f4add..3269e3f 100755 --- a/tools/generate_alexa.py +++ b/tools/generate_alexa.py @@ -1,30 +1,10 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import datetime import json -from os import path import zipfile -from inspect import currentframe, getframeinfo -import requests - - -def download(url, file): - user_agent = { - "User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"} - r = requests.get(url, headers=user_agent) - with open(file, 'wb') as fd: - for chunk in r.iter_content(4096): - fd.write(chunk) - - -def get_abspath_list_file(dst): - rel_path = getframeinfo(currentframe()).filename - current_folder = path.dirname(path.abspath(rel_path)) - real_path = path.join( - current_folder, '../lists/{dst}/list.json'.format(dst=dst)) - return path.abspath(path.realpath(real_path)) +from generator import download, download_to_file, get_abspath_list_file, get_version def process(file, dst): @@ -36,30 +16,29 @@ def 
process(file, dst): else: continue - alexa_warninglist = {} - alexa_warninglist[ - 'description'] = "Event contains one or more entries from the top 1000 of the most used website (Alexa)." - alexa_warninglist['version'] = int( - datetime.date.today().strftime('%Y%m%d')) - alexa_warninglist['name'] = "Top 1000 website from Alexa" - alexa_warninglist['type'] = 'hostname' - alexa_warninglist['list'] = [] - alexa_warninglist['matching_attributes'] = ['hostname', 'domain', 'url', 'domain|ip'] + warninglist = { + 'description': "Event contains one or more entries from the top 1000 of the most used website (Alexa).", + 'version': get_version(), + 'name': "Top 1000 website from Alexa", + 'type': 'hostname', + 'list': [], + 'matching_attributes': ['hostname', 'domain', 'url', 'domain|ip'] + } for site in top1000: v = site.decode('UTF-8').split(',')[1] - alexa_warninglist['list'].append(v.rstrip()) - alexa_warninglist['list'] = sorted(set(alexa_warninglist['list'])) + warninglist['list'].append(v.rstrip()) + warninglist['list'] = sorted(set(warninglist['list'])) with open(get_abspath_list_file(dst), 'w') as data_file: - json.dump(alexa_warninglist, data_file, indent=2, sort_keys=True) + json.dump(warninglist, data_file, indent=2, sort_keys=True) data_file.write("\n") if __name__ == "__main__": alexa_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" - alexa_file = "top-1m.csv.zip" + alexa_file = "alexa_top-1m.csv.zip" alexa_dst = "alexa" - download(alexa_url, alexa_file) + download_to_file(alexa_url, alexa_file) process(alexa_file, alexa_dst) diff --git a/tools/generate_majestic-million.py b/tools/generate_majestic-million.py index b8ccd00..a44480f 100755 --- a/tools/generate_majestic-million.py +++ b/tools/generate_majestic-million.py @@ -1,40 +1,38 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- -import datetime -import logging import json -import os -import requests +from generator import download, download_to_file, get_abspath_list_file, get_version 
-servers_url = 'http://downloads.majestic.com/majestic_million.csv' -csv_path = 'majestic_million.csv' -hostname_path = 'list.json' -if os.path.isfile(csv_path): - logging.warning('Not erasing local csv file') -else: - req = requests.get(servers_url) - with open(csv_path, 'wb') as fd: - for chunk in req.iter_content(4096): - fd.write(chunk) +def process(file, dst): -host_list = [] -with open(csv_path, newline='\n', encoding='utf-8', errors='replace') as csv_file: - top10k = csv_file.readlines()[:10000] + with open(file, newline='\n', encoding='utf-8', errors='replace') as csv_file: + sites = csv_file.readlines()[:10000] + + warninglist = { + 'name': 'Top 10K websites from Majestic Million', + 'version': get_version(), + 'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).', + 'matching_attributes': ['hostname', 'domain'], + 'type': 'hostname', + 'list': [] + } -version = int(datetime.date.today().strftime('%Y%m%d')) -out_list = {} + for site in sites: + v = site.split(',')[2] + warninglist['list'].append(v.rstrip()) + warninglist['list'] = sorted(set(warninglist['list'])) -out_list['name'] = 'Top 10K websites from Majestic Million' -out_list['version'] = version -out_list['description'] = 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).' 
-out_list['matching_attributes'] = ['hostname', 'domain'] -out_list['type'] = 'hostname' -out_list['list'] = sorted(set(host_list)) + with open(get_abspath_list_file(dst), 'w') as data_file: + json.dump(warninglist, data_file, indent=2, sort_keys=True) + data_file.write("\n") -for hostname in top10k: - v = hostname.split(',')[2] - out_list['list'].append(v.rstrip()) -out_list['list'] = sorted(set(out_list['list'])) -with open(hostname_path, 'w', newline='\n') as hostname_file: - hostname_file.write(json.dumps(out_list, indent=2, sort_keys=False)) + +if __name__ == '__main__': + majestic_url = 'http://downloads.majestic.com/majestic_million.csv' + majestic_file = 'majestic_million.csv' + majestic_dst = 'majestic_million' + + download_to_file(majestic_url, majestic_file) + process(majestic_file, majestic_dst) \ No newline at end of file diff --git a/tools/generate_mozilla_certificates.py b/tools/generate_mozilla_certificates.py index fc1a661..45e5412 100755 --- a/tools/generate_mozilla_certificates.py +++ b/tools/generate_mozilla_certificates.py @@ -4,32 +4,16 @@ import csv import datetime import json -from inspect import currentframe, getframeinfo -from os import path -import requests from OpenSSL.crypto import FILETYPE_PEM, load_certificate - -def download(url, file): - r = requests.get(url) - with open(file, 'wb') as fd: - for chunk in r.iter_content(4096): - fd.write(chunk) +from generator import download, download_to_file, get_abspath_list_file, get_version def gethash(cert, digest): return cert.digest(digest).decode('ASCII').replace(':', '').lower() -def get_abspath_list_file(dst): - rel_path = getframeinfo(currentframe()).filename - current_folder = path.dirname(path.abspath(rel_path)) - real_path = path.join( - current_folder, '../lists/{dst}/list.json'.format(dst=dst)) - return path.abspath(path.realpath(real_path)) - - def process(file, dst, type): hashes = set() with open(file, 'r') as f_in: @@ -68,8 +52,8 @@ if __name__ == '__main__': 
CA_known_intermediate_file = 'PublicAllIntermediateCertsWithPEMCSV.csv' CA_known_intermediate_dst = 'mozilla-IntermediateCA' - download(Included_CA_url, Included_CA_file) + download_to_file(Included_CA_url, Included_CA_file) process(Included_CA_file, Included_CA_dst, 'trusted CA certificates') - download(CA_known_intermediate_url, CA_known_intermediate_file) + download_to_file(CA_known_intermediate_url, CA_known_intermediate_file) process(CA_known_intermediate_file, CA_known_intermediate_dst, 'known intermedicate of trusted certificates') diff --git a/tools/generate_tranco.py b/tools/generate_tranco.py index 8e37f5d..fb76746 100755 --- a/tools/generate_tranco.py +++ b/tools/generate_tranco.py @@ -36,7 +36,7 @@ def process(file, warninglist, dst, first_10k=False): if __name__ == '__main__': tranco_url = 'https://tranco-list.eu/top-1m.csv.zip' - tranco_file = 'top-1m.csv.zip' + tranco_file = 'tranco_top-1m.csv.zip' download_to_file(tranco_url, tranco_file)