# mirror of https://github.com/CIRCL/AIL-framework
import gzip, string, sys, os, redis, re
import urllib  # used by create_tld_list (urllib.urlopen)

import dns.resolver

from pubsublogger import publisher

from lib_jobs import *
from operator import itemgetter

import numpy as np
import matplotlib.pyplot as plt
from pylab import *

import calendar as cal
from datetime import date, timedelta
from dateutil.rrule import rrule, DAILY


def create_graph_by_day_datastruct(r_serv, r_key, year, month):
    """Create a per-day data structure in Redis.

    :param r_serv: -- Redis connection database
    :param r_key: -- (str) The name of the key read in Redis (often the name
    of the keywords category list)
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    """
    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])

    # Initialise one member per day of the month with a score of 0.
    for dt in rrule(DAILY, dtstart=a, until=b):
        r_serv.zadd(r_key+'_by_day', 0, dt.strftime("%Y%m%d"))

    # Aggregate the per-file scores by day: the date (YYYY/MM/DD) is taken
    # from the paste filename and its slashes removed.
    for Tfilename in r_serv.zrange(r_key+'_occur', 0, -1, withscores=True):
        r_serv.zincrby(r_key+'_by_day',
                       Tfilename[0][-22:-12].replace('/', ''),
                       Tfilename[1])
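# Illustrative usage, assuming a local Redis instance and a previously
# populated 'mails_categ_occur' sorted set (both hypothetical here):
#   r = redis.StrictRedis(host='localhost', port=6379, db=0)
#   create_graph_by_day_datastruct(r, 'mails_categ', 2014, 5)
# Afterwards 'mails_categ_by_day' holds one member per day of May 2014
# ('20140501' ... '20140531'), scored by the occurrences on that day.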
def is_luhn_valid(card_number):
    """Apply the Luhn algorithm to validate a credit card number.

    :param card_number: -- (int) card number

    """
    # Work on the reversed digits: digits at even indices are kept as-is,
    # digits at odd indices are doubled and the digits of any two-digit
    # product are summed (divmod(d*2, 10) yields them directly).
    r = [int(ch) for ch in str(card_number)][::-1]
    return (sum(r[0::2]) + sum(sum(divmod(d*2, 10)) for d in r[1::2])) % 10 == 0
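# Worked example of the checksum above, with an illustrative number (not a
# real card): for 4532015112830366 the reversed digits are
# 6 6 3 0 3 8 2 1 1 5 1 0 2 3 5 4.
#   even indices kept as-is:  6+3+3+2+1+1+2+5 = 23
#   odd indices doubled, digits of two-digit products summed:
#     12->3, 0, 16->7, 2, 10->1, 0, 6, 8     = 27
# 23 + 27 = 50 and 50 % 10 == 0, so is_luhn_valid(4532015112830366)
# returns True.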
def checking_MX_record(r_serv, adress_set):
    """Check if email MX domains are responding.

    :param r_serv: -- Redis connection database
    :param adress_set: -- (set) A set of email addresses
    :return: -- (tuple) Number of addresses and the set of valid MX domains

    This function splits each email address and tries to resolve its domain
    name: for example@gmail.com it will try to resolve gmail.com.

    """
    score = 0
    num = len(adress_set)
    WalidMX = set()
    # Transform the set into a string and extract the domain parts.
    MXdomains = re.findall(r"@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", str(adress_set).lower())

    if MXdomains != []:
        for MXdomain in set(MXdomains):
            try:
                # Domain already cached in Redis: count it as valid.
                if r_serv.exists(MXdomain[1:]):
                    score += 1
                    WalidMX.add(MXdomain[1:])
                # Not cached yet: resolve its MX record.
                else:
                    if dns.resolver.query(MXdomain[1:], rdtype=dns.rdatatype.MX):
                        # Valid domain: cache it in Redis for one day.
                        r_serv.setex(MXdomain[1:], timedelta(days=1), 1)
                        score += 1
                        WalidMX.add(MXdomain[1:])

            except dns.resolver.NoNameservers:
                publisher.debug('NoNameservers, no non-broken nameservers are available to answer the query.')

            except dns.resolver.NoAnswer:
                publisher.debug('NoAnswer, the response did not contain an answer to the question.')

            except dns.name.EmptyLabel:
                publisher.debug('SyntaxError: EmptyLabel')

            except dns.resolver.NXDOMAIN:
                publisher.debug('The query name does not exist.')

            except dns.name.LabelTooLong:
                publisher.debug('The label is too long.')

    publisher.debug("emails before: {0} after: {1} (valid)".format(num, score))
    return (num, WalidMX)
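# Illustrative usage (hypothetical addresses; example.com is a domain
# reserved for documentation, so its MX lookup may well fail):
#   total, valid_mx = checking_MX_record(r, set(['alice@example.com',
#                                                'bob@gmail.com']))
#   # total == 2; valid_mx holds the domains whose MX record resolved.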
def checking_A_record(r_serv, domains_set):
    """Check if domains have a responding A record.

    :param r_serv: -- Redis connection database
    :param domains_set: -- (set) A set of domain names
    :return: -- (tuple) Number of domains and the set of valid domains

    """
    score = 0
    num = len(domains_set)
    WalidA = set()

    for Adomain in domains_set:
        try:
            # Domain already cached in Redis: count it as valid.
            if r_serv.exists(Adomain):
                score += 1
                WalidA.add(Adomain)
            # Not cached yet: resolve its A record.
            else:
                if dns.resolver.query(Adomain, rdtype=dns.rdatatype.A):
                    # Valid domain: cache it in Redis for one day.
                    r_serv.setex(Adomain, timedelta(days=1), 1)
                    score += 1
                    WalidA.add(Adomain)

        except dns.resolver.NoNameservers:
            publisher.debug('NoNameservers, no non-broken nameservers are available to answer the query.')

        except dns.resolver.NoAnswer:
            publisher.debug('NoAnswer, the response did not contain an answer to the question.')

        except dns.name.EmptyLabel:
            publisher.debug('SyntaxError: EmptyLabel')

        except dns.resolver.NXDOMAIN:
            publisher.debug('The query name does not exist.')

        except dns.name.LabelTooLong:
            publisher.debug('The label is too long.')

    publisher.debug("URLs before: {0} after: {1} (valid)".format(num, score))
    return (num, WalidA)
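# Illustrative usage (hypothetical domain set):
#   total, valid_a = checking_A_record(r, set(['example.com']))
#   # valid_a holds the domains with a resolving A record.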
def refining_regex_dataset(r_serv, r_key, regex, min_match, year, month, luhn=True, dnscheck=True):
    """Refine the "raw dataset" of pastes with regular expressions.

    :param r_serv: -- Redis connection database
    :param r_key: -- (str) The name of the key read in Redis (often the name
    of the keywords category list)
    :param regex: -- Regular expression to match against.
    :param min_match: -- (int) Files with this many matches or fewer are deleted
    :param luhn: -- (bool) Apply the Luhn checksum to credit card matches
    :param dnscheck: -- (bool) Check MX records for email address matches

    This function refines the database created by the classify_token_paste
    function. It reopens the files that matched the keywords category list,
    searches for the regular expression inside them and counts how many
    times it is found.

    If the regular expression matches too few times, the file is removed
    from the list as a false positive.

    Finally it merges the results by day, so that a bar graph can be drawn
    showing how many times per day the regex matched.

    """
    for filename in r_serv.zrange(r_key, 0, -1):

        with gzip.open(filename, 'rb') as F:
            matchs = set()

            for kword in F:
                for y in re.findall(regex, kword):
                    # Keep non-empty matches of plausible length only.
                    if y != '' and len(y) < 100:
                        matchs.add(y)

            # If there are fewer matches than min_match, delete the file
            # from the list (false positive).
            if len(matchs) <= min_match:
                r_serv.zrem(r_key, filename)
                publisher.debug("{0} deleted".format(filename))
            else:
                # Otherwise update the score.
                if r_key == "creditcard_categ" and luhn:
                    for card_number in matchs:
                        if is_luhn_valid(card_number):
                            r_serv.zincrby(r_key+'_occur', filename, 1)
                            publisher.info("{1} is valid in the file {0}".format(filename, card_number))
                        else:
                            publisher.debug("{0} card is invalid".format(card_number))

                elif r_key == "mails_categ" and dnscheck:
                    # checking_MX_record returns (total, valid_set): score
                    # the file by the number of addresses whose MX domain
                    # is valid.
                    r_serv.zadd(r_key+'_occur',
                                len(checking_MX_record(r_serv, matchs)[1]),
                                filename)

                else:
                    # Luhn and DNS checks not triggered (other categories):
                    # score the file by its number of distinct matches.
                    r_serv.zadd(r_key+'_occur',
                                len(matchs),
                                filename)

    create_graph_by_day_datastruct(r_serv, r_key, year, month)
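# Illustrative pipeline call, assuming the 'mails_categ' sorted set was
# filled beforehand (e.g. by a classify_token_paste-style step); the regex
# below is only a sketch of an email pattern:
#   email_regex = r"[\w.+-]+@[\w-]+\.[\w.-]+"
#   refining_regex_dataset(r, 'mails_categ', email_regex, 1, 2014, 5)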
def graph_categ_by_day(r_serv, filename, year, month, r_key):
    """Create a bar graph representing regex matches by day.

    :param r_serv: -- Redis connection database
    :param filename: -- (str) The absolute path where to save the figure.png
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param r_key: -- (str) The name of the key read in Redis (often the name
    of the keywords category list)

    This function plots the amount of the category per day.

    """
    adate = []
    categ_num = []
    rcParams['figure.figsize'] = 15, 10

    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])

    # Collect one label (day of month) and one score per day.
    for dt in rrule(DAILY, dtstart=a, until=b):
        adate.append(dt.strftime("%d"))
        categ_num.append(r_serv.zscore(r_key+'_by_day', dt.strftime("%Y%m%d")))

    n_groups = len(categ_num)
    adress_scores = tuple(categ_num)

    index = np.arange(n_groups)
    bar_width = 0.5
    opacity = 0.6

    plt.bar(index, adress_scores, bar_width,
            alpha=opacity,
            color='b',
            label=r_key)

    plt.plot(tuple(categ_num), 'r--')
    #plt.yscale('log')
    plt.xlabel('Days')
    plt.ylabel('Amount')
    plt.title('Occurrence of '+r_key+' by day')
    plt.xticks(index + bar_width/2, tuple(adate))

    plt.legend()
    plt.grid()

    plt.tight_layout()

    plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
                orientation='portrait', papertype=None, format="png",
                transparent=False, bbox_inches=None, pad_inches=0.1,
                frameon=True)

    publisher.info(filename+".png"+" saved!")
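# Illustrative usage (hypothetical output path; '.png' is appended by the
# function itself):
#   graph_categ_by_day(r, '/tmp/mails_categ_may', 2014, 5, 'mails_categ')
#   # writes '/tmp/mails_categ_may.png'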
def create_tld_list(url="https://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"):
    """Fetch a TLD list from a URL.

    :param url: -- The URL of the TLD list.
    :return: -- list

    This function downloads the list of effective TLD names from
    mozilla.org, saves it to a file, and returns a list of all the TLDs.

    """
    domains = []
    htmlSource = urllib.urlopen(url).read()
    with open("ICCANdomain", 'wb') as F:
        F.write(htmlSource)

    with open("ICCANdomain", 'rb') as F:
        for line in F:
            # Skip comment ("//") and empty lines; strip wildcard markers.
            if re.match(r"^\/\/|\n", line) is None:
                domains.append(re.sub(r'\*', '', line[:-1]))
            else:
                publisher.info("Comment line ignored.")

    return domains
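# Illustrative usage:
#   tlds = create_tld_list()
#   # e.g. 'com' in tlds should be True once the list has been fetched.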