add .gitignore for downloaded files, refactor code for generators: use central module, remove useless code, fix minor issues

pull/154/head
Kevin Holvoet 2020-07-17 10:06:06 +02:00
parent c00b21de5f
commit 049475ab22
9 changed files with 195 additions and 198 deletions

18
.gitignore vendored Normal file
View File

@@ -0,0 +1,18 @@
# files downloaded by generators
alexa_top-1m.csv.zip
amazon_ip-ranges.json
cisco_top-1m.csv.zip
cloudflare_ips-v4.txt
cloudflare_ips-v6.txt
IncludedCACertificateReportPEMCSV.csv
majestic_million.csv
ocsp_crl-hostnames.txt.txt
ocsp_crl-ipv4.txt.txt
ocsp_crl-ipv6.txt.txt
ocsp_ocsp-hostnames.txt.txt
ocsp_ocsp-ipv4.txt.txt
ocsp_ocsp-ipv6.txt.txt
PublicAllIntermediateCertsWithPEMCSV.csv
top500.domains.csv
top500.pages.csv
top-1m.csv.zip

View File

@@ -1,29 +1,39 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import datetime
import urllib.request
import json
res = urllib.request.urlopen('https://ip-ranges.amazonaws.com/ip-ranges.json')
from generator import download, download_to_file, get_abspath_list_file, get_version
res_body = res.read()
j = json.loads(res_body.decode("utf-8"))
l = []
for prefix in j['prefixes']:
l.append(prefix['ip_prefix'])
def process(file, dst):
    """Build the Amazon AWS IP ranges warninglist from a downloaded ip-ranges.json.

    :param file: path to the previously downloaded ip-ranges.json file
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    with open(file, 'r') as json_file:
        amazon_aws_ip_list = json.load(json_file)
    ranges = []
    # Collect both the IPv4 ('prefixes') and IPv6 ('ipv6_prefixes') sections.
    for prefix in amazon_aws_ip_list['prefixes']:
        ranges.append(prefix['ip_prefix'])
    for prefix in amazon_aws_ip_list['ipv6_prefixes']:
        ranges.append(prefix['ipv6_prefix'])
    warninglist = {}
    warninglist['name'] = 'List of known Amazon AWS IP address ranges'
    warninglist['version'] = get_version()
    warninglist['description'] = 'Amazon AWS IP address ranges (https://ip-ranges.amazonaws.com/ip-ranges.json)'
    warninglist['type'] = 'cidr'
    # De-duplicate and sort for a stable, diff-friendly list.
    warninglist['list'] = sorted(set(ranges))
    warninglist['matching_attributes'] = ["ip-src", "ip-dst", "domain|ip"]
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write('\n')


if __name__ == '__main__':
    amazon_url = "https://ip-ranges.amazonaws.com/ip-ranges.json"
    amazon_file = "amazon_ip-ranges.json"
    amazon_dst = "amazon-aws"
    download_to_file(amazon_url, amazon_file)
    process(amazon_file, amazon_dst)

View File

@@ -1,40 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import zipfile
import datetime
import json
cisco_url = "http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"
cisco_file = "top-1m.csv.zip"
user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
r = requests.get(cisco_url, headers=user_agent)
with open(cisco_file, 'wb') as fd:
for chunk in r.iter_content(4096):
fd.write(chunk)
with zipfile.ZipFile(cisco_file, 'r') as cisco_lists:
for name in cisco_lists.namelist():
if name == "top-1m.csv":
with cisco_lists.open(name) as top:
top1000 = top.readlines()[:1000]
else:
continue
cisco_warninglist = {}
version = int(datetime.date.today().strftime('%Y%m%d'))
cisco_warninglist['description'] = 'Event contains one or more entries from the top 1000 of the most used website (Cisco Umbrella).'
d = datetime.datetime.now()
cisco_warninglist['version'] = version
cisco_warninglist['name'] = 'Top 1000 website from Cisco Umbrella'
cisco_warninglist['type'] = 'hostname'
cisco_warninglist['matching_attributes'] = ['hostname', 'domain']
cisco_warninglist['list'] = []
from generator import download, download_to_file, get_abspath_list_file, get_version
for site in top1000:
v = str(site).split(',')[1]
cisco_warninglist['list'].append(v.rstrip())
cisco_warninglist['list'] = sorted(set(cisco_warninglist['list']))
print(json.dumps(cisco_warninglist))
def process(file, dst):
    """Build the Cisco Umbrella top-1000 warninglist from the downloaded zip.

    :param file: path to the previously downloaded top-1m.csv.zip archive
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    # Default to an empty list so a zip without top-1m.csv does not leave
    # top1000 unbound (previously a NameError).
    top1000 = []
    with zipfile.ZipFile(file, 'r') as cisco_lists:
        for name in cisco_lists.namelist():
            if name == "top-1m.csv":
                with cisco_lists.open(name) as top:
                    # Only the first 1000 entries are kept.
                    top1000 = top.readlines()[:1000]
                break
    warninglist = {
        'description': 'Event contains one or more entries from the top 1000 of the most used website (Cisco Umbrella).',
        'version': get_version(),
        'name': 'Top 1000 website from Cisco Umbrella',
        'type': 'hostname',
        'matching_attributes': ['hostname', 'domain', 'url', 'domain|ip'],
        'list': []
    }
    for site in top1000:
        # Rows look like b"rank,domain\r\n"; keep the domain column.
        # strip() already removes the trailing CR/LF, so the old
        # replace('\\r\\n', '') on literal backslash text was dead code.
        v = site.decode('UTF-8').split(',')[1]
        warninglist['list'].append(v.strip())
    warninglist['list'] = sorted(set(warninglist['list']))
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write("\n")


if __name__ == '__main__':
    cisco_url = "http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"
    cisco_file = "cisco_top-1m.csv.zip"
    cisco_dst = 'cisco_top1000'
    download_to_file(cisco_url, cisco_file)
    process(cisco_file, cisco_dst)

View File

@@ -1,33 +1,42 @@
#!/usr/bin/env python3
import json
import os
import requests
import datetime
import io
base_url="https://www.cloudflare.com/"
uri_list=['ips-v4','ips-v6']
dict=dict()
dict['list']=list()
def source_read_and_add(input_file):
output_list=list()
for line in input_file.splitlines():
output_list.append(line)
return output_list
from generator import download, download_to_file, get_abspath_list_file, get_version
for uri in uri_list:
url = base_url + uri
r=requests.get(url)
dict['list'] += source_read_and_add(r.text)
def process(files, dst):
    """Aggregate the downloaded Cloudflare range files into one warninglist.

    :param files: paths of the downloaded ips-v4 / ips-v6 text files
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    cidrs = []
    for path in files:
        with open(path, 'r') as handle:
            # One CIDR range per line; strip trailing newlines.
            cidrs.extend(entry.strip() for entry in handle.readlines())
    warninglist = {
        'name': "List of known Cloudflare IP ranges",
        'version': get_version(),
        'description': "List of known Cloudflare IP ranges (https://www.cloudflare.com/ips/)",
        'type': "cidr",
        'list': sorted(set(cidrs)),
        'matching_attributes': ["ip-dst", "ip-src", "domain|ip"],
    }
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write("\n")
dict['type'] = "cidr"
dict['matching_attributes']=["ip-dst","ip-src","domain|ip"]
dict['name']="List of known Cloudflare IP ranges"
dict['version']= int(datetime.date.today().strftime('%Y%m%d'))
dict['description']="List of known Cloudflare IP ranges (https://www.cloudflare.com/ips/)"
dict['list']=list(set(dict['list']))
print(json.dumps(dict))
if __name__ == '__main__':
    # Download each published range file, then merge them into one list.
    cf_base_url = "https://www.cloudflare.com/"
    uri_list = ['ips-v4', 'ips-v6']
    cf_dst = 'cloudflare'
    to_process = []
    for uri in uri_list:
        target = 'cloudflare_{}.txt'.format(uri)
        download_to_file(cf_base_url + uri, target)
        to_process.append(target)
    process(to_process, cf_dst)

View File

@@ -1,43 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import json
import datetime
url = 'https://raw.githubusercontent.com/krassi/covid19-related/master/whitelist-domains.txt'
r = requests.get(url)
whitelist = r.text
whitelist = list(set(whitelist.split()))
warninglist = {
'name': 'Covid-19 Krassi\'s Whitelist',
'description': 'Krassimir\'s Covid-19 whitelist of known good Covid-19 related websites.',
'type': 'hostname',
'matching_attributes': ['domain', 'hostname', 'url'],
'version': int(datetime.date.today().strftime('%Y%m%d')),
'list': sorted(whitelist)
}
with open('../lists/covid-19-krassi-whitelist/list.json', 'w+') as data_file:
json.dump(warninglist, data_file, indent=2, sort_keys=True)
url = 'https://raw.githubusercontent.com/Cyber-Threat-Coalition/goodlist/master/hostnames.txt'
r = requests.get(url)
whitelist = r.text
whitelist = list(set(whitelist.split()))
warninglist = {
'name': 'Covid-19 Cyber Threat Coalition\'s Whitelist',
'description': 'The Cyber Threat Coalition\'s whitelist of COVID-19 related websites.',
'type': 'hostname',
'matching_attributes': ['domain', 'hostname', 'url'],
'version': int(datetime.date.today().strftime('%Y%m%d')),
'list': sorted(whitelist)
}
with open('../lists/covid-19-cyber-threat-coalition-whitelist/list.json', 'w+') as data_file:
json.dump(warninglist, data_file, indent=2, sort_keys=True)
from generator import download, download_to_file, get_abspath_list_file, get_version
def process(url, warninglist, dst):
    """Fetch a whitelist, complete the warninglist metadata and write it out.

    :param url: URL of a whitespace-separated hostname whitelist
    :param warninglist: partially filled warninglist dict (name/description);
        completed in place with type, matching_attributes, version and list
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    entries = download(url).text.split()
    warninglist['type'] = 'hostname'
    warninglist['matching_attributes'] = ['domain', 'hostname', 'url']
    warninglist['version'] = get_version()
    # De-duplicate, then sort for a stable output.
    warninglist['list'] = sorted(set(entries))
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write('\n')
if __name__ == '__main__':
    # (url, partial warninglist, destination) triples, processed in order.
    sources = [
        (
            'https://raw.githubusercontent.com/krassi/covid19-related/master/whitelist-domains.txt',
            {
                'name': 'Covid-19 Krassi\'s Whitelist',
                'description': 'Krassimir\'s Covid-19 whitelist of known good Covid-19 related websites.'
            },
            'covid-19-krassi-whitelist',
        ),
        (
            'https://raw.githubusercontent.com/Cyber-Threat-Coalition/goodlist/master/hostnames.txt',
            {
                'name': 'Covid-19 Cyber Threat Coalition\'s Whitelist',
                'description': 'The Cyber Threat Coalition\'s whitelist of COVID-19 related websites.'
            },
            'covid-19-cyber-threat-coalition-whitelist',
        ),
    ]
    for src_url, src_warninglist, src_dst in sources:
        process(src_url, src_warninglist, src_dst)

View File

@@ -1,30 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import json
from os import path
import zipfile
from inspect import currentframe, getframeinfo
import requests
def download(url, file):
user_agent = {
"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
r = requests.get(url, headers=user_agent)
with open(file, 'wb') as fd:
for chunk in r.iter_content(4096):
fd.write(chunk)
def get_abspath_list_file(dst):
rel_path = getframeinfo(currentframe()).filename
current_folder = path.dirname(path.abspath(rel_path))
real_path = path.join(
current_folder, '../lists/{dst}/list.json'.format(dst=dst))
return path.abspath(path.realpath(real_path))
from generator import download, download_to_file, get_abspath_list_file, get_version
def process(file, dst):
@@ -36,30 +16,29 @@ def process(file, dst):
else:
continue
alexa_warninglist = {}
alexa_warninglist[
'description'] = "Event contains one or more entries from the top 1000 of the most used website (Alexa)."
alexa_warninglist['version'] = int(
datetime.date.today().strftime('%Y%m%d'))
alexa_warninglist['name'] = "Top 1000 website from Alexa"
alexa_warninglist['type'] = 'hostname'
alexa_warninglist['list'] = []
alexa_warninglist['matching_attributes'] = ['hostname', 'domain', 'url', 'domain|ip']
warninglist = {
'description': "Event contains one or more entries from the top 1000 of the most used website (Alexa).",
'version': get_version(),
'name': "Top 1000 website from Alexa",
'type': 'hostname',
'list': [],
'matching_attributes': ['hostname', 'domain', 'url', 'domain|ip']
}
for site in top1000:
v = site.decode('UTF-8').split(',')[1]
alexa_warninglist['list'].append(v.rstrip())
alexa_warninglist['list'] = sorted(set(alexa_warninglist['list']))
warninglist['list'].append(v.rstrip())
warninglist['list'] = sorted(set(warninglist['list']))
with open(get_abspath_list_file(dst), 'w') as data_file:
json.dump(alexa_warninglist, data_file, indent=2, sort_keys=True)
json.dump(warninglist, data_file, indent=2, sort_keys=True)
data_file.write("\n")
if __name__ == "__main__":
alexa_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
alexa_file = "top-1m.csv.zip"
alexa_file = "alexa_top-1m.csv.zip"
alexa_dst = "alexa"
download(alexa_url, alexa_file)
download_to_file(alexa_url, alexa_file)
process(alexa_file, alexa_dst)

View File

@@ -1,40 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import logging
import json
import os
import requests
from generator import download, download_to_file, get_abspath_list_file, get_version
servers_url = 'http://downloads.majestic.com/majestic_million.csv'
csv_path = 'majestic_million.csv'
hostname_path = 'list.json'
if os.path.isfile(csv_path):
logging.warning('Not erasing local csv file')
else:
req = requests.get(servers_url)
with open(csv_path, 'wb') as fd:
for chunk in req.iter_content(4096):
fd.write(chunk)
def process(file, dst):
    """Build the Majestic Million top-10K warninglist from the downloaded CSV.

    :param file: path to the previously downloaded majestic_million.csv
    :param dst: warninglist directory name passed to get_abspath_list_file()
    """
    # errors='replace' guards against stray non-UTF-8 bytes in the feed.
    with open(file, newline='\n', encoding='utf-8', errors='replace') as csv_file:
        # Only the first 10K entries are kept.
        sites = csv_file.readlines()[:10000]
    warninglist = {
        'name': 'Top 10K websites from Majestic Million',
        'version': get_version(),
        'description': 'Event contains one or more entries from the top 10K of the most used websites (Majestic Million).',
        'matching_attributes': ['hostname', 'domain'],
        'type': 'hostname',
        'list': []
    }
    for site in sites:
        # Column 2 of the Majestic CSV holds the domain name.
        v = site.split(',')[2]
        warninglist['list'].append(v.rstrip())
    warninglist['list'] = sorted(set(warninglist['list']))
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write("\n")


if __name__ == '__main__':
    majestic_url = 'http://downloads.majestic.com/majestic_million.csv'
    majestic_file = 'majestic_million.csv'
    majestic_dst = 'majestic_million'
    download_to_file(majestic_url, majestic_file)
    process(majestic_file, majestic_dst)

View File

@@ -4,32 +4,16 @@
import csv
import datetime
import json
from inspect import currentframe, getframeinfo
from os import path
import requests
from OpenSSL.crypto import FILETYPE_PEM, load_certificate
def download(url, file):
r = requests.get(url)
with open(file, 'wb') as fd:
for chunk in r.iter_content(4096):
fd.write(chunk)
from generator import download, download_to_file, get_abspath_list_file, get_version
def gethash(cert, digest):
return cert.digest(digest).decode('ASCII').replace(':', '').lower()
def get_abspath_list_file(dst):
rel_path = getframeinfo(currentframe()).filename
current_folder = path.dirname(path.abspath(rel_path))
real_path = path.join(
current_folder, '../lists/{dst}/list.json'.format(dst=dst))
return path.abspath(path.realpath(real_path))
def process(file, dst, type):
hashes = set()
with open(file, 'r') as f_in:
@@ -68,8 +52,8 @@ if __name__ == '__main__':
CA_known_intermediate_file = 'PublicAllIntermediateCertsWithPEMCSV.csv'
CA_known_intermediate_dst = 'mozilla-IntermediateCA'
download(Included_CA_url, Included_CA_file)
download_to_file(Included_CA_url, Included_CA_file)
process(Included_CA_file, Included_CA_dst, 'trusted CA certificates')
download(CA_known_intermediate_url, CA_known_intermediate_file)
download_to_file(CA_known_intermediate_url, CA_known_intermediate_file)
process(CA_known_intermediate_file, CA_known_intermediate_dst,
'known intermedicate of trusted certificates')

View File

@@ -36,7 +36,7 @@ def process(file, warninglist, dst, first_10k=False):
if __name__ == '__main__':
tranco_url = 'https://tranco-list.eu/top-1m.csv.zip'
tranco_file = 'top-1m.csv.zip'
tranco_file = 'tranco_top-1m.csv.zip'
download_to_file(tranco_url, tranco_file)