AIL-framework/bin/packages/lib_refine.py

311 lines
9.7 KiB
Python
Raw Normal View History

import gzip, string, sys, os, redis, re
import dns.resolver
from pubsublogger import publisher
from lib_jobs import *
from operator import itemgetter
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import calendar as cal
from datetime import date, timedelta
from dateutil.rrule import rrule, DAILY
def create_graph_by_day_datastruct(r_serv, r_key, year, month):
"""Creating a datastructure in redis.
:param r_serv: -- Redis connexion database
:param r_key: -- (str) The name of the key read in redis (often the name of
the keywords category list)
:param year: -- (integer) The year to process
:param month: -- (integer) The month to process
"""
a = date(year, month, 01)
b = date(year, month, cal.monthrange(year, month)[1])
for dt in rrule(DAILY, dtstart = a, until = b):
r_serv.zadd(r_key+'_by_day',0,dt.strftime("%Y%m%d"))
for Tfilename in r_serv.zrange(r_key+'_occur', 0, -1, withscores = True):
r_serv.zincrby(r_key+'_by_day',
Tfilename[0][-22:-12].replace('/',''),
Tfilename[1])
def is_luhn_valid(card_number):
"""Apply the Luhn algorithm to validate credit card.
:param card_number: -- (int) card number
"""
r = [int(ch) for ch in str(card_number)][::-1]
return (sum(r[0::2]) + sum(sum(divmod(d*2,10)) for d in r[1::2])) % 10 == 0
def checking_MX_record(r_serv, adress_set):
"""Check if emails MX domains are responding.
:param r_serv: -- Redis connexion database
:param adress_set: -- (set) This is a set of emails adress
:return: (int) Number of adress with a responding and valid MX domains
This function will split the email adress and try to resolve their domains
names: on example@gmail.com it will try to resolve gmail.com
"""
score = 0
num = len(adress_set)
WalidMX = set([])
# Transforming the set into a string
MXdomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", str(adress_set).lower())
if MXdomains != []:
for MXdomain in set(MXdomains):
try:
#Already in Redis living.
if r_serv.exists(MXdomain[1:]):
score += 1
WalidMX.add(MXdomain[1:])
# Not already in Redis
else:
# If I'm Walid MX domain
if dns.resolver.query(MXdomain[1:], rdtype = dns.rdatatype.MX):
# Gonna be added in redis.
r_serv.setex(MXdomain[1:],timedelta(days=1),1)
score += 1
WalidMX.add(MXdomain[1:])
else:
pass
except dns.resolver.NoNameservers:
publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
except dns.resolver.NoAnswer:
publisher.debug('NoAnswer, The response did not contain an answer to the question.')
except dns.name.EmptyLabel:
publisher.debug('SyntaxError: EmptyLabel')
except dns.resolver.NXDOMAIN:
publisher.debug('The query name does not exist.')
except dns.name.LabelTooLong:
publisher.debug('The Label is too long')
finally:
pass
publisher.debug("emails before: {0} after: {1} (valid)".format(num, score))
return (num, WalidMX)
def checking_A_record(r_serv, domains_set):
score = 0
num = len(domains_set)
WalidA = set([])
for Adomain in domains_set:
try:
#Already in Redis living.
if r_serv.exists(Adomain):
score += 1
WalidA.add(Adomain)
# Not already in Redis
else:
# If I'm Walid domain
if dns.resolver.query(Adomain, rdtype = dns.rdatatype.A):
# Gonna be added in redis.
r_serv.setex(Adomain,timedelta(days=1),1)
score += 1
WalidA.add(Adomain)
else:
pass
except dns.resolver.NoNameservers:
publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
except dns.resolver.NoAnswer:
publisher.debug('NoAnswer, The response did not contain an answer to the question.')
except dns.name.EmptyLabel:
publisher.debug('SyntaxError: EmptyLabel')
except dns.resolver.NXDOMAIN:
publisher.debug('The query name does not exist.')
except dns.name.LabelTooLong:
publisher.debug('The Label is too long')
finally:
pass
publisher.debug("URLs before: {0} after: {1} (valid)".format(num, score))
return (num, WalidA)
def refining_regex_dataset(r_serv, r_key, regex, min_match, year, month, luhn = True, dnscheck = True):
"""Refine the "raw dataset" of paste with regulars expressions
:param r_serv: -- Redis connexion database
:param r_key: -- (str) The name of the key read in redis (often the name of
the keywords category list)
:param min_match: -- (int) Below this number file are deleted
:param regex: -- Regular expression which will be match.
This function Refine database created with classify_token_paste function.
It opening again the files which matchs the keywords category list, found
regular expression inside it and count how many time is found.
If there is not too much match about the regular expression the file is
deleted from the list.
Than it finally merge the result by day to be able to create a bar graph
which will represent how many occurence by day the regex match.
"""
for filename in r_serv.zrange(r_key, 0, -1):
with gzip.open(filename, 'rb') as F:
var = 0
matchs = set([])
for num, kword in enumerate(F):
match = re.findall(regex, kword)
var += len(match)
for y in match:
if y != '' and len(y) < 100:
matchs.add(y)
# If there is less match than min_match delete it (False pos)
if len(matchs) <= min_match :
r_serv.zrem(r_key, filename)
publisher.debug("{0} deleted".format(filename))
else:
# else changing the score.
if r_key == "creditcard_categ" and luhn:
for card_number in matchs:
if is_luhn_valid(card_number):
r_serv.zincrby(r_key+'_occur', filename, 1)
publisher.info("{1} is valid in the file {0}".format(filename, card_number))
else:
publisher.debug("{0} card is invalid".format(card_number))
if r_key == "mails_categ" and dnscheck:
r_serv.zadd(r_key+'_occur', checking_MX_record(r_serv, matchs), filename)
else:
# LUHN NOT TRIGGERED (Other Categs)
r_serv.zadd(r_key+'_occur',
len(matchs),
filename)
create_graph_by_day_datastruct(r_serv, r_key, year, month)
def graph_categ_by_day(r_serv, filename, year, month, r_key):
"""Create a bargraph representing regex matching by day
:param r_serv: -- Redis connexion database
:param filename: -- (str) The absolute path where to save the figure.png
:param r_key: -- (str) The name of the key read in redis (often the name of
the keywords category list)
:param year: -- (integer) The year to process
:param month: -- (integer) The month to process
This function display the amount of the category per day.
"""
adate = []
categ_num = []
rcParams['figure.figsize'] = 15, 10
a = date(year, month, 01)
b = date(year, month, cal.monthrange(year, month)[1])
for dt in rrule(DAILY, dtstart = a, until = b):
adate.append(dt.strftime("%d"))
categ_num.append(r_serv.zscore(r_key+'_by_day',dt.strftime("%Y%m%d")))
n_groups = len(categ_num)
adress_scores = tuple(categ_num)
index = np.arange(n_groups)
bar_width = 0.5
opacity = 0.6
ladress = plt.bar(index, adress_scores, bar_width,
alpha = opacity,
color = 'b',
label = r_key)
plt.plot(tuple(categ_num), 'r--')
#plt.yscale('log')
plt.xlabel('Days')
plt.ylabel('Amount')
plt.title('Occurence of '+r_key+' by day')
plt.xticks(index + bar_width/2 , tuple(adate))
plt.legend()
plt.grid()
plt.tight_layout()
plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
orientation='portrait', papertype=None, format="png",
transparent=False, bbox_inches=None, pad_inches=0.1,
frameon=True)
publisher.info(filename+".png"+" saved!")
def create_tld_list(url = "https://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"):
"""Recover a tld list from url.
:param url: -- The url of the tld list.
:return: -- list
This function recover from mozilla.org the list of the effective tld names,
Save it as a file, and return a list of all the tld.
"""
domains = []
htmlSource = urllib.urlopen(url).read()
with open("ICCANdomain", 'wb') as F:
F.write(htmlSource)
with open("ICCANdomain", 'rb') as F:
for num, line in enumerate(F):
if re.match(r"^\/\/|\n", line) == None:
domains.append(re.sub(r'\*', '', line[:-1]))
else:
publisher.info("Comment line ignored.")
return domains