mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			
		
			
				
	
	
		
			204 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Python
		
	
	
			
		
		
	
	
			204 lines
		
	
	
		
			6.0 KiB
		
	
	
	
		
			Python
		
	
	
| import sys, hashlib, os, os.path, gzip, string, glob, itertools, copy, shutil
 | |
| import redis, crcmod, mmh3, time, fileinput
 | |
| import crcmod, mmh3
 | |
| 
 | |
| from operator import itemgetter, attrgetter
 | |
| from pubsublogger import publisher
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def listdirectory(path):
    """Collect the path of every file found below a directory.

    :param path: -- The pathname of the directory to walk.

    The tree rooted at ``path`` is walked with ``os.walk`` and every file
    name is joined to the directory it was found in. The returned list
    therefore holds absolute paths when ``path`` itself is absolute.
    Directories are not listed, only the files they contain.

    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(path)
        for name in names
    ]
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def clean(dirty):
    """Filter out non-printable characters from the string it receives.

    :param dirty: -- (str) The string to sanitise.

    Only the characters listed in ``string.printable`` are kept, in their
    original order; everything else is dropped. The result is a new string.

    """
    return ''.join(char for char in dirty if char in string.printable)
 | |
| 
 | |
| 
 | |
| 
 | |
def select_hash(hashkind, line):
    """Select the kind of hashing for the line.

    :param hashkind: -- (str) The name of the hash: "md5", "sha1", "crc"
        or "murmur".
    :param line: -- (str) The string to hash.

    This function is a kind of hash selector which will use the hash passed
    in argument to hash the string also passed in argument.

    The digest is always returned as a ``str``.

    :raises ValueError: when ``hashkind`` is not one of the supported names.

    """
    if hashkind == "md5":
        hashline = hashlib.md5(line).hexdigest()

    elif hashkind == "sha1":
        hashline = hashlib.sha1(line).hexdigest()

    elif hashkind == "crc":
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(line)
        hashline = crc32.hexdigest()

    elif hashkind == "murmur":
        hashline = mmh3.hash(line)

    else:
        # Fail with a clear message instead of falling through to the
        # return statement with ``hashline`` unbound.
        raise ValueError("unsupported hash kind: {0!r}".format(hashkind))

    # str() keeps the return type uniform whatever the backend produced.
    return str(hashline)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def redis_populate(pipe, folder, minline, hashkind, jmp, insert_type):
    """Feed the lines of gzip files to "insert_redis".

    :param pipe: -- Redis pipe
    :param folder: -- iterable of path names; each one is opened with
        gzip.open
    :param minline: -- the minimum length of line to hash
    :param hashkind: -- the hash to use
    :param jmp: -- (int) line stride: 1 handles every line, any other value
        handles only the lines whose 1-based number satisfies
        ``num % jmp == 1``
    :param insert_type: -- which kind of datastructure to create in redis.

    Every selected line is handed to "insert_redis" together with its
    1-based line number, and the pipe is executed once per file, after the
    last line of that file has been queued.

    """
    for archive in folder:
        with gzip.open(archive, 'rb') as stream:
            for lineno, content in enumerate(stream, 1):
                # The short-circuit keeps the modulo from being evaluated
                # when every line is wanted (jmp == 1).
                if jmp == 1 or (lineno % jmp) == 1:
                    insert_redis(archive, content, pipe, minline,
                                 hashkind, lineno, insert_type)

            pipe.execute()
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def insert_redis(filename, line, pipe, minline, hashkind, num, insert_type):
    """Insert hashed line in redis.

    :param filename: -- the path name of the file the line comes from
    :param line: -- the clear line which will be hashed.
    :param pipe: -- Redis pipe
    :param minline: -- the minimum length of line to hash
    :param hashkind: -- the hash to use
    :param num: -- (int) the number of the line inside its file; line 1 is
        additionally stored as a human readable preview of the file
    :param insert_type: -- (int) Choose the datastructure used in redis.

    Lines shorter than ``minline`` and unknown ``insert_type`` values are
    ignored. Otherwise the commands queued on the pipe are:

    insert_type == 2: ALLIN
    "hash"[hashedline][occurence] => to index all different hashs + scoring
    "hashedline"[filename.gz] => to associate the file.gz to his hashedline
    "L:hashedline"[clearline] => for the correspondance
    "filename.gz"[firstline] => for human reading (only when num == 1)

    insert_type == 1: SORTED SET (for the ./top.py script)
    "hash"[hashedline][occurence] => to index all different hashs + scoring
    "hashedline"[clearline] => to associate the clear line to its hash

    insert_type == 0: BASIC SET (for ./Graph.py)
    "hash"[hashedline] to index all different hashs (without scores)
    "hashedline"[filename.gz] => to associate the file.gz to his hashedline
    "filename.gz"[firstline] => for human reading (only when num == 1)

    """
    if insert_type not in (0, 1, 2) or len(line) < minline:
        return

    # Hash once per line; every key below is derived from this digest.
    hashed = select_hash(hashkind, line)

    # NOTE(review): the (name, value, amount) order used for zincrby matches
    # redis-py releases before 3.0; later releases expect
    # (name, amount, value) -- confirm against the pinned client version.

    if insert_type == 1:  # SORTED SET FOR TOP100.py
        pipe.zincrby("hash", hashed, 1)
        pipe.sadd(hashed, clean(line))
        return

    # Last path component, used as the redis key/member naming the file.
    short_name = filename.split('/', 20)[-1]

    if insert_type == 2:  # ALLIN
        pipe.zincrby("hash", hashed, 1)
        pipe.sadd(hashed, short_name)
        pipe.sadd("L:" + hashed, clean(line))
    else:  # insert_type == 0, SET FOR THE GRAPH
        pipe.sadd("hash", hashed)
        pipe.sadd(hashed, short_name)

    if num == 1:
        # Keep the first 80 characters of the first line as a preview.
        pipe.sadd(short_name, clean(line[0:80]))
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
def remove_pure_doppelganger(r_serv, nb):
    """Remove identical pastes.

    :param r_serv: -- Redis connexion database
    :param nb: -- (int) Maximum number of files to pop and check

    File names are popped from the redis list "filelist". The md5 of the
    whole content of each file is compared with the digests already seen
    during this call: when the digest is already known the file is deleted
    from disk, otherwise the digest is remembered.

    The loop stops early when "filelist" is exhausted before ``nb`` names
    have been popped.

    """
    # A set gives constant-time membership tests; only membership and the
    # number of digests are ever needed.
    seen_hashes = set()

    for _ in range(nb):
        filename = r_serv.lpop("filelist")
        if filename is None:
            # lpop yields None on an empty list: nothing left to process.
            break

        with open(filename, 'rb') as paste:
            hashline = hashlib.md5(paste.read()).hexdigest()

        print(len(seen_hashes))

        # The file is closed at this point, so it can safely be removed.
        if hashline in seen_hashes:
            os.remove(filename)
            publisher.debug("{0} removed".format(filename))
            print("{0} removed".format(filename))
        else:
            seen_hashes.add(hashline)
 |