mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			
		
			
				
	
	
		
			311 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Python
		
	
	
			
		
		
	
	
			311 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Python
		
	
	
| import gzip, string, sys, os, redis, re
 | |
| import dns.resolver
 | |
| 
 | |
| from pubsublogger import publisher
 | |
| 
 | |
| from lib_jobs import *
 | |
| from operator import itemgetter
 | |
| 
 | |
| import numpy as np
 | |
| import matplotlib.pyplot as plt
 | |
| from pylab import *
 | |
| 
 | |
| import calendar as cal
 | |
| from datetime import date, timedelta
 | |
| from dateutil.rrule import rrule, DAILY
 | |
| 
 | |
| 
 | |
| 
 | |
def create_graph_by_day_datastruct(r_serv, r_key, year, month):
    """Build the per-day occurrence sorted set of a category in redis.

    :param r_serv: -- Redis connexion database
    :param r_key: -- (str) The name of the key read in redis (often the name of
        the keywords category list)
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    Every day of the requested month is first written to the sorted set
    ``<r_key>_by_day`` with a score of 0, then the score of each member of
    ``<r_key>_occur`` is added to the day extracted from that member's name.

    NOTE(review): the redis calls use the positional forms
    ``zadd(key, score, member)`` and ``zincrby(key, member, amount)``; this
    looks like the redis-py 2.x ``StrictRedis`` argument order -- confirm
    against the installed client before upgrading it.

    """
    by_day_key = r_key + '_by_day'
    days_in_month = cal.monthrange(year, month)[1]

    # Reset (or create) one zero-scored entry per calendar day of the month.
    for day in range(1, days_in_month + 1):
        r_serv.zadd(by_day_key, 0, date(year, month, day).strftime("%Y%m%d"))

    for paste_path, occurrences in r_serv.zrange(r_key + '_occur', 0, -1,
                                                  withscores=True):
        # Assumes the paste path carries its date as "YYYY/MM/DD" located
        # 22 to 12 characters before its end -- TODO confirm path layout.
        day_member = paste_path[-22:-12].replace('/', '')
        r_serv.zincrby(by_day_key, day_member, occurrences)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def is_luhn_valid(card_number):
    """Apply the Luhn algorithm to validate a credit card number.

    :param card_number: -- (int or str) card number; spaces and dashes used
        as group separators are ignored
    :return: -- (bool) True when the number passes the Luhn checksum, False
        otherwise (including empty or non-numeric input)

    """
    text = str(card_number).replace(' ', '').replace('-', '')
    try:
        # Digits are processed from the rightmost one, as Luhn requires.
        digits = [int(ch) for ch in reversed(text)]
    except ValueError:
        return False
    if not digits:
        # An empty number would otherwise yield a checksum of 0 ("valid").
        return False

    untouched = sum(digits[0::2])
    # Every second digit is doubled; a two-digit product contributes the
    # sum of its digits, which is what sum(divmod(x, 10)) computes.
    doubled = sum(sum(divmod(d * 2, 10)) for d in digits[1::2])
    return (untouched + doubled) % 10 == 0
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def checking_MX_record(r_serv, adress_set):
    """Check if emails MX domains are responding.

    :param r_serv: -- Redis connexion database
    :param adress_set: -- (set) This is a set of emails adress
    :return: -- (tuple) ``(num, WalidMX)`` where ``num`` is the number of
        entries in adress_set and ``WalidMX`` is the set of domains that
        were found in redis or answered the MX query

    This function will split the email adress and try to resolve their domains
    names: on example@gmail.com it will try to resolve gmail.com

    A domain that answers is stored in redis for one day, so it is not
    queried again while that key exists.

    """
    score = 0
    num = len(adress_set)
    WalidMX = set()
    # The whole set is rendered as one string so that a single regex pass
    # extracts every "@domain.tld" fragment.
    MXdomains = re.findall(r"@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}",
                           str(adress_set).lower())

    for MXdomain in set(MXdomains):
        domain = MXdomain[1:]  # drop the leading "@"
        try:
            # Already in Redis living.
            if r_serv.exists(domain):
                score += 1
                WalidMX.add(domain)
            # Not already in Redis: ask the resolver for an MX record.
            elif dns.resolver.query(domain, rdtype=dns.rdatatype.MX):
                # Gonna be added in redis.
                r_serv.setex(domain, timedelta(days=1), 1)
                score += 1
                WalidMX.add(domain)

        except dns.resolver.NoNameservers:
            publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')

        except dns.resolver.NoAnswer:
            publisher.debug('NoAnswer, The response did not contain an answer to the question.')

        except dns.name.EmptyLabel:
            publisher.debug('SyntaxError: EmptyLabel')

        except dns.resolver.NXDOMAIN:
            publisher.debug('The query name does not exist.')

        except dns.name.LabelTooLong:
            publisher.debug('The Label is too long')

        except dns.resolver.Timeout:
            # One slow domain must not abort the check of the others.
            publisher.debug('Timeout, The DNS operation timed out.')

    publisher.debug("emails before: {0} after: {1} (valid)".format(num, score))
    return (num, WalidMX)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def checking_A_record(r_serv, domains_set):
    """Check if domains have a responding A record.

    :param r_serv: -- Redis connexion database
    :param domains_set: -- (iterable) The domain names to check
    :return: -- (tuple) ``(num, WalidA)`` where ``num`` is the number of
        entries in domains_set and ``WalidA`` is the set of domains that
        were found in redis or answered the A query

    A domain that answers is stored in redis for one day, so it is not
    queried again while that key exists.

    """
    score = 0
    num = len(domains_set)
    WalidA = set()

    for Adomain in domains_set:
        try:
            # Already in Redis living.
            if r_serv.exists(Adomain):
                score += 1
                WalidA.add(Adomain)
            # Not already in Redis: ask the resolver for an A record.
            elif dns.resolver.query(Adomain, rdtype=dns.rdatatype.A):
                # Gonna be added in redis.
                r_serv.setex(Adomain, timedelta(days=1), 1)
                score += 1
                WalidA.add(Adomain)

        except dns.resolver.NoNameservers:
            publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')

        except dns.resolver.NoAnswer:
            publisher.debug('NoAnswer, The response did not contain an answer to the question.')

        except dns.name.EmptyLabel:
            publisher.debug('SyntaxError: EmptyLabel')

        except dns.resolver.NXDOMAIN:
            publisher.debug('The query name does not exist.')

        except dns.name.LabelTooLong:
            publisher.debug('The Label is too long')

        except dns.resolver.Timeout:
            # One slow domain must not abort the check of the others.
            publisher.debug('Timeout, The DNS operation timed out.')

    publisher.debug("URLs before: {0} after: {1} (valid)".format(num, score))
    return (num, WalidA)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def refining_regex_dataset(r_serv, r_key, regex, min_match, year, month, luhn=True, dnscheck=True):
    """Refine the "raw dataset" of paste with regulars expressions

    :param r_serv: -- Redis connexion database
    :param r_key: -- (str) The name of the key read in redis (often the name of
        the keywords category list)
    :param regex: -- Regular expression which will be match.
    :param min_match: -- (int) Files with this many distinct matches or fewer
        are removed from r_key
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param luhn: -- (bool) For "creditcard_categ", score only the matches
        that pass the Luhn check
    :param dnscheck: -- (bool) For "mails_categ", score only the domains
        that pass the MX check

    Each gzip file listed in the sorted set r_key is read line by line and
    the distinct, non-empty matches shorter than 100 characters are
    collected.

    If there is not enough matches the file is removed from r_key (false
    positive). Otherwise its score in ``<r_key>_occur`` is updated:

    - "creditcard_categ" with luhn: incremented once per Luhn-valid card;
    - "mails_categ" with dnscheck: set to the number of valid MX domains;
    - anything else: set to the number of distinct matches.

    Finally the result is merged by day to be able to create a bar graph
    of the occurences by day.

    NOTE(review): the credit card branch uses zincrby, so scores accumulate
    if this function is run again on the same files -- confirm intended.

    """
    occur_key = r_key + '_occur'

    for filename in r_serv.zrange(r_key, 0, -1):
        matchs = set()

        with gzip.open(filename, 'rb') as F:
            for kword in F:
                for y in re.findall(regex, kword):
                    if y != '' and len(y) < 100:
                        matchs.add(y)

        # If there is less match than min_match delete it (False pos)
        if len(matchs) <= min_match:
            r_serv.zrem(r_key, filename)
            publisher.debug("{0} deleted".format(filename))

        elif r_key == "creditcard_categ" and luhn:
            for card_number in matchs:
                if is_luhn_valid(card_number):
                    r_serv.zincrby(occur_key, filename, 1)
                    publisher.info("{1} is valid in the file {0}".format(filename, card_number))
                else:
                    publisher.debug("{0} card is invalid".format(card_number))

        elif r_key == "mails_categ" and dnscheck:
            # checking_MX_record returns (num, valid_domains); the redis
            # score must be a number, so the valid domains are counted.
            valid_domains = checking_MX_record(r_serv, matchs)[1]
            r_serv.zadd(occur_key, len(valid_domains), filename)

        else:
            # LUHN NOT TRIGGERED (Other Categs)
            r_serv.zadd(occur_key, len(matchs), filename)

    create_graph_by_day_datastruct(r_serv, r_key, year, month)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def graph_categ_by_day(r_serv, filename, year, month, r_key):
    """Create a bargraph representing regex matching by day

    :param r_serv: -- Redis connexion database
    :param filename: -- (str) The absolute path where to save the figure;
        ".png" is appended to it
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param r_key: -- (str) The name of the key read in redis (often the name of
        the keywords category list)

    This function plots, for each day of the month, the score stored in the
    sorted set ``<r_key>_by_day`` and saves the figure as a PNG file.

    """
    adate = []
    categ_num = []
    plt.rcParams['figure.figsize'] = 15, 10

    days_in_month = cal.monthrange(year, month)[1]
    for day in range(1, days_in_month + 1):
        dt = date(year, month, day)
        adate.append(dt.strftime("%d"))
        # zscore gives None for a day that was never stored; plot it as 0.
        day_score = r_serv.zscore(r_key + '_by_day', dt.strftime("%Y%m%d"))
        categ_num.append(day_score or 0)

    index = np.arange(len(categ_num))
    bar_width = 0.5
    opacity = 0.6

    try:
        plt.bar(index, tuple(categ_num), bar_width,
                alpha=opacity,
                color='b',
                label=r_key)

        plt.plot(tuple(categ_num), 'r--')
        plt.xlabel('Days')
        plt.ylabel('Amount')
        plt.title('Occurence of '+r_key+' by day')
        plt.xticks(index + bar_width/2, tuple(adate))

        plt.legend()
        plt.grid()

        plt.tight_layout()

        # papertype/frameon were passed with their default values and are
        # not accepted by recent matplotlib releases, so they are omitted.
        plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
                    orientation='portrait', format="png",
                    transparent=False, bbox_inches=None, pad_inches=0.1)
    finally:
        # Close the implicit figure so the next call starts from a blank
        # one instead of drawing over this graph.
        plt.close()

    publisher.info(filename+".png"+" saved!")
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def create_tld_list(url = "https://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"):
    """Recover a tld list from url.

    :param url: -- The url of the tld list.
    :return: -- list

    This function downloads the list of the effective tld names from url,
    saves the raw download in the file "ICCANdomain" (current working
    directory) and returns the list of all the tld. Lines starting with
    "//" and blank lines are skipped; "*" characters are removed from the
    kept entries.

    NOTE(review): the default url points at mxr.mozilla.org, which may no
    longer be served -- verify and pass an up-to-date url if needed.

    """
    # Imported here because the module-level imports do not provide urllib;
    # the fallback keeps the function usable on Python 2.
    from contextlib import closing
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    with closing(urlopen(url)) as response:
        raw_list = response.read()

    with open("ICCANdomain", 'wb') as F:
        F.write(raw_list)

    domains = []
    # splitlines() drops the line terminators, so the last line is kept
    # whole even when the download does not end with a newline.
    for raw_line in raw_list.splitlines():
        line = raw_line.decode('utf-8')
        if not line or line.startswith('//'):
            publisher.info("Comment line ignored.")
        else:
            domains.append(line.replace('*', ''))

    return domains
 |