url-abuse/url_abuse_async.py

492 lines
14 KiB
Python

#!/usr/bin/env python
#
#
# Copyright (C) 2014 Sascha Rommelfangen, Raphael Vinot
# Copyright (C) 2014 CIRCL Computer Incident Response Center Luxembourg (SMILE gie)
#
from datetime import date
import json
import redis
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
from pyfaup.faup import Faup
import socket
import dns.resolver
import re
import sys
import logging
from pypdns import PyPDNS
import bgpranking_web
import urlquery
from pypssl import PyPSSL
import requests
from bs4 import BeautifulSoup
try:
import sphinxapi
sphinx = True
except:
sphinx = False
enable_cache = True
r_cache = None
def _cache_init(host='localhost', port=6334, db=1):
global r_cache
if enable_cache and r_cache is None:
r_cache = redis.Redis(host, port, db=db)
def _cache_set(key, value, field=None):
_cache_init()
if enable_cache:
if field is None:
r_cache.setex(key, json.dumps(value), 3600)
else:
r_cache.hset(key, field, json.dumps(value))
r_cache.expire(key, 3600)
def _cache_get(key, field=None):
_cache_init()
if enable_cache:
if field is None:
value_json = r_cache.get(key)
else:
value_json = r_cache.hget(key, field)
if value_json is not None:
return json.loads(value_json)
return None
def to_bool(s):
"""
Converts the given string to a boolean.
"""
return s.lower() in ('1', 'true', 'yes', 'on')
def get_submissions(url, day=None):
_cache_init()
if enable_cache:
if day is None:
day = date.today().isoformat()
else:
day = day.isoformat()
key = date.today().isoformat() + '_submissions'
return r_cache.zscore(key, url)
def get_mail_sent(url, day=None):
_cache_init()
if enable_cache:
if day is None:
day = date.today().isoformat()
else:
day = day.isoformat()
key = date.today().isoformat() + '_mails'
return r_cache.sismember(key, url)
def set_mail_sent(url, day=None):
_cache_init()
if enable_cache:
if day is None:
day = date.today().isoformat()
else:
day = day.isoformat()
key = date.today().isoformat() + '_mails'
return r_cache.sadd(key, url)
def is_valid_url(url):
cached = _cache_get(url, 'valid')
key = date.today().isoformat() + '_submissions'
r_cache.zincrby(key, url)
if cached is not None:
return cached
fex = Faup()
if url.startswith('hxxp'):
url = 'http' + url[4:]
elif not url.startswith('http'):
url = 'http://' + url
logging.debug("Checking validity of URL: " + url)
fex.decode(url)
scheme = fex.get_scheme()
host = fex.get_host()
if scheme is None or host is None:
reason = "Not a valid http/https URL/URI"
return False, url, reason
_cache_set(url, (True, url, None), 'valid')
return True, url, None
def is_ip(host):
if ':' in host:
try:
socket.inet_pton(socket.AF_INET6, host)
return True
except:
pass
else:
try:
socket.inet_aton(host)
return True
except:
pass
return False
def try_resolve(fex, url):
fex.decode(url)
host = fex.get_host().lower()
if is_ip(host):
return True, None
try:
ipaddr = dns.resolver.query(host, 'A')
except Exception:
reason = "DNS server problem. Check resolver settings."
return False, reason
if not ipaddr:
reason = "Host " + host + " does not exist."
return False, reason
return True, None
def get_urls(url, depth=1):
if depth > 5:
print('Too many redirects.')
return
fex = Faup()
def meta_redirect(content):
c = content.lower()
soup = BeautifulSoup(c)
for result in soup.find_all(attrs={'http-equiv': 'refresh'}):
if result:
out = result["content"].split(";")
if len(out) == 2:
wait, text = out
a, url = text.split('=', 1)
return url.strip()
return None
resolve, reason = try_resolve(fex, url)
if not resolve:
# FIXME: inform that the domain does not resolve
yield url
return
logging.debug("Making HTTP connection to " + url)
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0'}
try:
response = requests.get(url, allow_redirects=True, headers=headers,
timeout=15, verify=False)
except:
# That one can fail (DNS for example)
# FIXME: inform that the get failed
yield url
return
if response.history is not None:
for h in response.history:
# Yeld the urls in the order we find them
yield h.url
yield response.url
meta_redir_url = meta_redirect(response.content)
if meta_redir_url is not None:
depth += 1
if not meta_redir_url.startswith('http'):
fex.decode(url)
base = '{}://{}'.format(fex.get_scheme(), fex.get_host())
port = fex.get_port()
if port is not None:
base += ':{}'.format(port)
if not meta_redir_url.startswith('/'):
# relative redirect. resource_path has the initial '/'
if fex.get_resource_path() is not None:
base += fex.get_resource_path()
if not base.endswith('/'):
base += '/'
meta_redir_url = base + meta_redir_url
for url in get_urls(meta_redir_url, depth):
yield url
def url_list(url):
cached = _cache_get(url, 'list')
if cached is not None:
return cached
list_urls = []
for u in get_urls(url):
if u is None or u in list_urls:
continue
list_urls.append(u)
_cache_set(url, list_urls, 'list')
return list_urls
def dns_resolve(url):
cached = _cache_get(url, 'dns')
if cached is not None:
return cached
fex = Faup()
fex.decode(url)
host = fex.get_host().lower()
ipv4 = None
ipv6 = None
if not is_ip(host):
try:
ipv4 = [str(ip) for ip in dns.resolver.query(host, 'A')]
except:
logging.debug("No IPv4 address assigned to: " + host)
try:
ipv6 = [str(ip) for ip in dns.resolver.query(host, 'AAAA')]
except:
logging.debug("No IPv6 address assigned to: " + host)
_cache_set(url, (ipv4, ipv6), 'dns')
return ipv4, ipv6
def phish_query(url, key, query):
cached = _cache_get(query, 'phishtank')
if cached is not None:
return cached
postfields = {'url': quote(query), 'format': 'json', 'app_key': key}
response = requests.post(url, data=postfields)
res = response.json()
if res["meta"]["status"] == "success":
if res["results"]["in_database"]:
_cache_set(query, res["results"]["phish_detail_page"], 'phishtank')
return res["results"]["phish_detail_page"]
else:
# no information
pass
elif res["meta"]["status"] == 'error':
# Inform the user?
# errormsg = res["errortext"]
pass
return None
def sphinxsearch(server, port, url, query):
if not sphinx:
return None
cached = _cache_get(query, 'sphinx')
if cached is not None:
return cached
client = sphinxapi.SphinxClient()
client.SetServer(server, port)
client.SetMatchMode(2)
client.SetConnectTimeout(5.0)
result = []
res = client.Query(query)
if res.get("matches") is not None:
for ticket in res["matches"]:
ticket_id = ticket["id"]
ticket_link = url + str(ticket_id)
result.append(ticket_link)
_cache_set(query, result, 'sphinx')
return result
def vt_query_url(url, url_up, key, query, upload=True):
cached = _cache_get(query, 'vt')
if cached is not None:
return cached
parameters = {"resource": query, "apikey": key}
if upload:
parameters['scan'] = 1
response = requests.post(url, data=parameters)
if response.text is None or len(response.text) == 0:
return None
res = response.json()
msg = res["verbose_msg"]
link = res.get("permalink")
positives = res.get("positives")
total = res.get("total")
if positives is not None:
_cache_set(query, (msg, link, positives, total), 'vt')
return msg, link, positives, total
def gsb_query(url, query):
cached = _cache_get(query, 'gsb')
if cached is not None:
return cached
param = '1\n' + query
response = requests.post(url, data=param)
status = response.status_code
if status == 200:
_cache_set(query, response.text, 'gsb')
return response.text
def urlquery_query(url, key, query):
cached = _cache_get(query, 'urlquery')
if cached is not None:
return cached
try:
urlquery.url = url
urlquery.key = key
response = urlquery.search(query)
except:
return None
if response['_response_']['status'] == 'ok':
if response.get('reports') is not None:
total_alert_count = 0
for r in response['reports']:
total_alert_count += r['urlquery_alert_count']
total_alert_count += r['ids_alert_count']
total_alert_count += r['blacklist_alert_count']
_cache_set(query, total_alert_count, 'urlquery')
return total_alert_count
else:
return None
def process_emails(emails, ignorelist, replacelist):
to_return = list(set(emails))
for mail in reversed(to_return):
for ignorelist_entry in ignorelist:
if re.search(ignorelist_entry, mail, re.I):
if mail in to_return:
to_return.remove(mail)
for k, v in list(replacelist.items()):
if re.search(k, mail, re.I):
if k in to_return:
to_return.remove(k)
to_return += v
return to_return
def whois(server, port, domain, ignorelist, replacelist):
cached = _cache_get(domain, 'whois')
if cached is not None:
return cached
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(15)
try:
s.connect((server, port))
except Exception:
print("Connection problems - check WHOIS server")
print(("WHOIS request while problem occurred: ", domain))
print(("WHOIS server: {}:{}".format(server, port)))
sys.exit(0)
if domain.startswith('http'):
fex = Faup()
fex.decode(domain)
d = fex.get_domain().lower()
else:
d = domain
s.send(d + "\r\n")
response = ''
while True:
d = s.recv(4096)
response += d
if d == '':
break
s.close()
match = re.findall(r'[\w\.-]+@[\w\.-]+', response)
emails = process_emails(match, ignorelist, replacelist)
if len(emails) == 0:
return None
list_mail = list(set(emails))
_cache_set(domain, list_mail, 'whois')
return list_mail
def pdnscircl(url, user, passwd, q):
cached = _cache_get(q, 'pdns')
if cached is not None:
return cached
pdnscircl = PyPDNS(url, basic_auth=(user, passwd))
response = pdnscircl.query(q)
all_uniq = []
for e in reversed(response):
host = e['rrname'].lower()
if host in all_uniq:
continue
else:
all_uniq.append(host)
response = (len(all_uniq), all_uniq[:5])
_cache_set(q, response, 'pdns')
return response
def psslcircl(url, user, passwd, q):
cached = _cache_get(q, 'pssl')
if cached is not None:
return cached
psslcircl = PyPSSL(url, basic_auth=(user, passwd))
response = psslcircl.query(q)
if response.get(q) is not None:
entries = response[q]
_cache_set(q, entries, 'pssl')
return entries
return None
def bgpranking(ip):
cached = _cache_get(ip, 'bgp')
if cached is not None:
return cached
details = bgpranking_web.ip_lookup(ip, 7)
ptrr = details.get('ptrrecord')
if details.get('history') is None or len(details.get('history')) == 0:
return ptrr, None, None, None, None, None
asn = details['history'][0].get('asn')
rank_info = bgpranking_web.cached_daily_rank(asn)
position, total = bgpranking_web.cached_position(asn)
asn_descr = rank_info[1]
rank = rank_info[-1]
response = (ptrr, asn_descr, asn, int(position), int(total), float(rank))
_cache_set(ip, response, 'bgp')
return response
def _deserialize_cached(entry):
to_return = {}
h = r_cache.hgetall(entry)
for key, value in list(h.items()):
to_return[key] = json.loads(value)
return to_return
def get_url_data(url):
data = _deserialize_cached(url)
if data.get('dns') is not None:
ipv4, ipv6 = data['dns']
ip_data = {}
if ipv4 is not None:
for ip in ipv4:
ip_data[ip] = _deserialize_cached(ip)
if ipv6 is not None:
for ip in ipv6:
ip_data[ip] = _deserialize_cached(ip)
if len(ip_data) > 0:
data.update(ip_data)
return {url: data}
def cached(url):
_cache_init()
if not enable_cache:
return [url]
url_data = get_url_data(url)
to_return = [url_data]
if url_data[url].get('list') is not None:
url_redirs = url_data[url]['list']
for u in url_redirs:
if u == url:
continue
to_return.append(get_url_data(u))
return to_return