# AIL-framework/bin/packages/lib_redis_insert.py
#
# 204 lines
# 6.0 KiB
# Python

import sys, hashlib, os, os.path, gzip, string, glob, itertools, copy, shutil
import redis, crcmod, mmh3, time, fileinput
import crcmod, mmh3
from operator import itemgetter, attrgetter
from pubsublogger import publisher
def listdirectory(path):
    """Collect every file found beneath a directory.

    :param path: -- The absolute pathname to a directory.

    The tree rooted at ``path`` is walked with ``os.walk`` and each file name
    is joined to the directory that contains it.

    :returns: -- (list) one path per file, in traversal order.

    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(path)
        for name in names
    ]
def clean(dirty):
    """Filter out the non-printable characters of a string.

    :param dirty: -- (str) The text to filter.

    Only the characters contained in ``string.printable`` are kept; the
    relative order of the kept characters is preserved.

    :returns: -- (str) ``dirty`` without its non-printable characters.

    """
    return ''.join(filter(string.printable.__contains__, dirty))
def select_hash(hashkind, line):
    """Hash a line with the requested algorithm.

    :param hashkind: -- (str) The name of the hash: "md5", "sha1", "crc"
        or "murmur".
    :param line: -- (str) The string to hash.

    This function is a hash selector: it applies the algorithm named by
    ``hashkind`` to ``line``.

    :returns: -- (str) the digest, always converted with ``str()``.
    :raises ValueError: -- when ``hashkind`` is not one of the names above.

    """
    if hashkind == "md5":
        hashline = hashlib.md5(line).hexdigest()

    elif hashkind == "sha1":
        hashline = hashlib.sha1(line).hexdigest()

    elif hashkind == "crc":
        # 0x104c11db7 is the CRC-32 generator polynomial.
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(line)
        hashline = crc32.hexdigest()

    elif hashkind == "murmur":
        # Not a hex string here; the str() below normalises the return type.
        hashline = mmh3.hash(line)

    else:
        # Without this branch an unknown name would reach the return
        # statement with ``hashline`` unbound.
        raise ValueError("Unsupported hash kind: {0!r}".format(hashkind))

    return str(hashline)
def redis_populate(pipe, folder, minline, hashkind, jmp, insert_type):
    """Feed the lines of gzip files to "insert_redis".

    :param pipe: -- Redis pipe
    :param folder: -- iterable of path names of the gzip files to process
    :param minline: -- the minimum length of line to hash
    :param hashkind: -- the hash to use
    :param jmp: -- (int) line stride: with a value n greater than 1 only the
        lines 1, n+1, 2n+1, ... are inserted. Any value below 2 (1, 0,
        False, None) disables the jumping mode and every line is inserted.
    :param insert_type: -- which kind of datastructure to create in redis.

    Each file is opened with gzip and read line by line, the lines being
    numbered from 1. The selected lines are handed to "insert_redis"
    together with their line number.

    """
    # A stride below 2 means "no skipping". Handling it up front also keeps
    # the modulo below from dividing by zero when jmp is 0 or False.
    every_line = not jmp or jmp <= 1

    for filename in folder:
        with gzip.open(filename, 'rb') as F:
            for num, line in enumerate(F, 1):
                # Line 1 always matches, so the first line of a file is
                # inserted whatever the stride is.
                if every_line or num % jmp == 1:
                    insert_redis(filename,
                                 line,
                                 pipe,
                                 minline,
                                 hashkind,
                                 num,
                                 insert_type)

        # NOTE(review): the queued commands are sent once per file here;
        # confirm this matches the intended flush point.
        pipe.execute()
def insert_redis(filename, line, pipe, minline, hashkind, num, insert_type):
    """Insert hashed line in redis.

    :param filename: -- the path name of the file the line comes from
    :param line: -- the clear line which will be hashed.
    :param pipe: -- Redis pipe
    :param minline: -- the minimum length of line to hash
    :param hashkind: -- the hash to use
    :param num: -- (int) the number of the line in its file, starting at 1
    :param insert_type: -- (int) Choose the datastructure used in redis.

    This function queues on the pipe the commands which store the hashed
    line in the selected redis datastructure. Lines shorter than "minline"
    are not hashed. Any other value of "insert_type" queues nothing.

    The datastructure is represented as follow:

    insert_type 2: ALLIN
        "hash"[hashedline][occurence] => to index all different hashs + scoring
        "hashedline"[filename.gz] => to associate the file.gz to his hashedline
        "L:hashedline"[clearline] => for the correspondance
        "filename.gz"[firstline] => for human reading

    insert_type 1: SORTED SET (for the ./top.py script)
        "hash"[hashedline][occurence] => to index all different hashs + scoring
        "hashedline"[clearline] => to associate the clear line to his hash

    insert_type 0: BASIC SET (for ./Graph.py)
        "hash"[hashedline] to index all different hashs (without scores)
        "hashedline"[filename.gz] => to associate the file.gz to his hashedline
        "filename.gz"[firstline] => for human reading

    """
    # NOTE(review): zincrby is called as (name, value, amount), the redis-py
    # 2.x order; redis-py 3.0 expects (name, amount, value) -- confirm
    # against the installed client.
    # NOTE(review): the first-line record is written whatever the length of
    # the line is -- confirm this is the intended nesting.
    if insert_type == 2:  # ALLIN
        # With maxsplit=20 a path holding more than 20 '/' keeps its tail
        # unsplit; for shorter paths this is the bare file name.
        short_name = filename.split('/', 20)[-1]

        if len(line) >= minline:
            # Hash once and reuse the digest for every key below.
            hashed = select_hash(hashkind, line)
            pipe.zincrby("hash", hashed, 1)
            pipe.sadd(hashed, short_name)
            pipe.sadd("L:" + hashed, clean(line))

        if num == 1:
            pipe.sadd(short_name, clean(line[0:80]))

    elif insert_type == 1:  # SORTED SET FOR TOP100.py
        if len(line) >= minline:
            hashed = select_hash(hashkind, line)
            pipe.zincrby("hash", hashed, 1)
            pipe.sadd(hashed, clean(line))

    elif insert_type == 0:  # SET FOR THE GRAPH
        short_name = filename.split('/', 20)[-1]

        if len(line) >= minline:
            hashed = select_hash(hashkind, line)
            pipe.sadd("hash", hashed)
            pipe.sadd(hashed, short_name)

        if num == 1:
            pipe.sadd(short_name, clean(line[0:80]))
def remove_pure_doppelganger(r_serv, nb):
    """Remove identical pastes.

    :param r_serv: -- Redis connexion database
    :param nb: -- (int) Maximum number of files to take from the queue

    File names are popped one by one from the redis list "filelist". The
    md5 of the whole content of each file is compared to the digests already
    met during this call: if it is a known one the file is deleted from the
    disk, otherwise the digest is remembered.

    The function stops early when "filelist" is empty.

    """
    # A set gives constant-time membership tests.
    seen = set()

    # itertools.repeat counts nb iterations without building a list.
    for _ in itertools.repeat(None, nb):
        filename = r_serv.lpop("filelist")
        if filename is None:
            # Nothing left in the queue: there is no file to open.
            break

        with open(filename, 'rb') as paste:
            digest = hashlib.md5(paste.read()).hexdigest()

        # Progress output: number of distinct contents met so far.
        print(len(seen))

        if digest in seen:
            # The file is closed at this point, so it can be deleted safely.
            os.remove(filename)
            publisher.debug("{0} removed".format(filename))
            print("{0} removed".format(filename))
        else:
            seen.add(digest)