misp-warninglists/tools/generate-alexa.py

39 lines
1.3 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import itertools
import json
import zipfile

import requests
2016-08-03 15:43:35 +02:00
# Location of the Alexa top-1M archive and the local file it is saved to.
alexa_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
alexa_file = "top-1m.csv.zip"
# A browser-like User-agent header is sent with the download request.
user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}

# Stream the archive to disk in 4 KiB chunks instead of buffering the whole
# response in memory first; the timeout keeps a stalled connection from
# hanging the script forever.
r = requests.get(alexa_url, headers=user_agent, stream=True, timeout=60)
# Fail here on an HTTP error rather than saving the error body as a "zip"
# and hitting a confusing bad-archive error further down.
r.raise_for_status()
with open(alexa_file, 'wb') as fd:
    for chunk in r.iter_content(4096):
        fd.write(chunk)
# Keep only the first 1000 rows of the ranking CSV as raw (bytes) lines.
# Iterating the archive member stops reading after 1000 lines instead of
# loading every row of the file just to slice it. ZipFile.open() raises
# KeyError when the member is missing, rather than leaving top1000 undefined
# for the code that follows.
with zipfile.ZipFile(alexa_file, 'r') as alexa_lists:
    with alexa_lists.open("top-1m.csv") as top:
        top1000 = list(itertools.islice(top, 1000))
# Version is today's date as an integer in YYYYMMDD form.
version = int(datetime.date.today().strftime('%Y%m%d'))

# Collect the second CSV column (the site name) of every row, de-duplicated.
domains = set()
for site in top1000:
    # Rows are bytes read from the archive: decode them and strip the line
    # ending (LF, CRLF, or none on a final line) before splitting, instead of
    # slicing characters off the repr() of the bytes object.
    fields = site.decode('utf-8').strip().split(',')
    if len(fields) < 2:
        # Blank or malformed row: there is no second column to take.
        continue
    domains.add(fields[1])

# Key order matches the order in which the keys were originally inserted, so
# the emitted JSON keeps the same layout.
alexa_warninglist = {
    'description': "Event contains one or more entries from the top 1000 of the most used website (Alexa).",
    'version': version,
    'name': "Top 1000 website from Alexa",
    'list': sorted(domains),
    'matching_attributes': ['hostname', 'domain'],
}
print(json.dumps(alexa_warninglist))