#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Generate a warning list from the Tranco ranking (https://tranco-list.eu/).

The script downloads the zipped top-1M ranking, extracts the domain column
of ``top-1m.csv`` and prints the resulting warning list as a single JSON
document on stdout.

Provenance: recovered from the patch "add script to generate warning list
from Tranco" (commit 17952df91e827dfb2a093579217d660c54981c55, Trey Darley,
10 Jan 2020), which adds this file as ``tools/generate_tranco.py``.
"""

import datetime
import json
import zipfile

tranco_url = 'https://tranco-list.eu/top-1m.csv.zip'
tranco_file = 'top-1m.csv.zip'
user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0'}

# Name of the CSV member expected inside the downloaded archive.
tranco_member = 'top-1m.csv'
# Seconds to wait for the server before giving up instead of hanging forever.
request_timeout = 60


def download_list(url=tranco_url, destination=tranco_file, headers=None,
                  timeout=request_timeout):
    """Stream *url* into the file *destination* and return *destination*.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, so an error page is never saved as if it were the archive.
    """
    # Imported lazily so the parsing helpers below remain importable (and
    # testable) on machines where ``requests`` is not installed.
    import requests

    if headers is None:
        headers = user_agent
    # stream=True keeps the archive out of memory; it is written chunk by chunk.
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(destination, 'wb') as fd:
            for chunk in response.iter_content(4096):
                fd.write(chunk)
    return destination


def read_sites(archive=tranco_file, member=tranco_member):
    """Return the raw (bytes) lines of *member* stored in the zip *archive*.

    Raises ``ValueError`` when the archive does not contain *member*.
    """
    with zipfile.ZipFile(archive, 'r') as tranco_lists:
        if member not in tranco_lists.namelist():
            raise ValueError('{!r} not found in {!r}'.format(member, archive))
        with tranco_lists.open(member) as tranco:
            return tranco.readlines()


def extract_domains(lines):
    """Return the second CSV column of every well-formed row in *lines*.

    Rows may be ``bytes`` (decoded as UTF-8) or ``str``. Blank rows and rows
    without a non-empty second column are skipped rather than raising
    ``IndexError``. Order and duplicates are preserved.
    """
    domains = []
    for raw in lines:
        row = raw.decode('UTF-8') if isinstance(raw, bytes) else raw
        fields = row.split(',')
        if len(fields) < 2:
            continue
        domain = fields[1].strip()
        if domain:
            domains.append(domain)
    return domains


def build_warninglist(domains, version=None):
    """Build the warning-list dictionary for *domains*.

    *version* defaults to today's date encoded as the integer ``YYYYMMDD``.
    The ``list`` entry holds the de-duplicated, sorted domains. Keys are
    inserted in the order the script has always emitted them.
    """
    if version is None:
        version = int(datetime.date.today().strftime('%Y%m%d'))
    return {
        'description': "Event contains one or more entries from the top 1,000,000 most-used sites (Tranco).",
        'version': version,
        'name': "Top 1,000,000 most-used sites from Tranco",
        'type': 'hostname',
        'list': sorted(set(domains)),
        'matching_attributes': ['hostname', 'domain', 'url', 'domain|ip'],
    }


def main():
    """Download the ranking and print the warning list as JSON on stdout."""
    download_list()
    sites = read_sites()
    print(json.dumps(build_warninglist(extract_domains(sites))))


if __name__ == '__main__':
    main()