From 04a8f1bdf20bb3f65ee8fb268fc093d2baf5fbf9 Mon Sep 17 00:00:00 2001 From: Starow Date: Thu, 14 Aug 2014 11:48:46 +0200 Subject: [PATCH] maxi cleanup old code :'( --- bin/{tests => }/indexer_lookup.py | 0 bin/packages/lib_gephi.py | 64 ---- bin/packages/lib_jobs.py | 151 --------- bin/packages/lib_redis_insert.py | 203 ------------ bin/packages/lib_refine.py | 178 ----------- bin/packages/lib_search.py | 103 ------ bin/packages/lib_words.py | 490 +---------------------------- bin/tests/Bargraph.py | 56 ---- bin/tests/Bargraph_categ_by_day.py | 64 ---- bin/tests/Classify_Paste_Token.py | 61 ---- bin/tests/Display_pid.py | 46 --- bin/tests/Graph.py | 65 ---- bin/tests/Interset.py | 52 --- bin/tests/Populate.py | 75 ----- bin/tests/Refine_with_regex.py | 78 ----- bin/tests/Remove_Doppelganger.py | 44 --- bin/tests/Remove_longline_fp.py | 57 ---- bin/tests/Search.py | 72 ----- bin/tests/Top.py | 58 ---- bin/tests/WordsCurve_Populate.py | 64 ---- bin/tests/WordsCurves.py | 57 ---- bin/tests/Wordsranking_Populate.py | 54 ---- 22 files changed, 14 insertions(+), 2078 deletions(-) rename bin/{tests => }/indexer_lookup.py (100%) delete mode 100644 bin/packages/lib_gephi.py delete mode 100644 bin/packages/lib_jobs.py delete mode 100644 bin/packages/lib_redis_insert.py delete mode 100644 bin/packages/lib_search.py delete mode 100755 bin/tests/Bargraph.py delete mode 100755 bin/tests/Bargraph_categ_by_day.py delete mode 100755 bin/tests/Classify_Paste_Token.py delete mode 100755 bin/tests/Display_pid.py delete mode 100755 bin/tests/Graph.py delete mode 100755 bin/tests/Interset.py delete mode 100755 bin/tests/Populate.py delete mode 100755 bin/tests/Refine_with_regex.py delete mode 100755 bin/tests/Remove_Doppelganger.py delete mode 100755 bin/tests/Remove_longline_fp.py delete mode 100755 bin/tests/Search.py delete mode 100755 bin/tests/Top.py delete mode 100755 bin/tests/WordsCurve_Populate.py delete mode 100755 bin/tests/WordsCurves.py delete mode 100755 bin/tests/Wordsranking_Populate.py diff --git a/bin/tests/indexer_lookup.py b/bin/indexer_lookup.py similarity index 100% rename from bin/tests/indexer_lookup.py rename to bin/indexer_lookup.py diff --git a/bin/packages/lib_gephi.py b/bin/packages/lib_gephi.py deleted file mode 100644 index 6fe35492..00000000 --- a/bin/packages/lib_gephi.py +++ /dev/null @@ -1,64 +0,0 @@ -import networkx as nx -import xml.sax.saxutils as xlm -import redis - -def Gephi_Graph(r_serv, graphpath, mincard, maxcard, insert_type): - """Create Gephi Graph by calling a "Sub function": Create_Graph - - :param r_serv: -- connexion to redis database - :param graphpath: -- the absolute path of the .gephi graph created. - :param mincard: -- the minimum links between 2 nodes to be created - :param maxcard: -- the maximum links between 2 nodes to be created - :param insert_type: -- the type of datastructure used to create the graph. - - In fact this function is juste here to be able to choose between two kind of - Redis database structure: One which is a Sorted set and the other a simple - set. - - """ - g = nx.Graph() - - if (insert_type == 0): - - for h in r_serv.smembers("hash"): - Create_Graph(r_serv, g, h, graphpath, mincard, maxcard) - - elif (insert_type == 2): - - for h in r_serv.zrange("hash", 0, -1): - Create_Graph(r_serv, g, h, graphpath, mincard, maxcard) - - nx.write_gexf(g,graphpath) - print nx.info(g) - - - - -def Create_Graph(r_serv, graph, h, graphpath, mincard, maxcard): - """Create Gephi Graph. 
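# [editor's sketch, not part of the original patch] A simplified,
# self-contained version of the hash <-> paste linking idea that
# Gephi_Graph implements above, with a plain dict standing in for the
# Redis sets (sample data hypothetical; requires networkx):
import networkx as nx

hash_to_pastes = {"d41d8cd9": ["paste1.gz", "paste2.gz"],
                  "9e107d9d": ["paste2.gz"]}
g = nx.Graph()
for h, pastes in hash_to_pastes.items():
    for p in pastes:
        g.add_edge(h, p)          # one edge per (hash, paste) co-occurrence
nx.write_gexf(g, "out.gexf")      # same writer the code above relies on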
- - :param r_serv: -- connexion to redis database - :param graph: -- networkx graph object - :param h: -- (str) the hash which will be transform into a node. - :param graphpath: -- the absolute path of the .gephi graph created. - :param mincard: -- the minimum links between 2 nodes to be created - :param maxcard: -- the maximum links between 2 nodes to be created - - This function link all the pastes with theirs own hashed lines. - Of course a paste can have multiple hashed lines and an hashed line can be - contained in multiple paste. - In this case it's a common hash. - - """ - if (r_serv.scard(h) >= mincard) and (r_serv.scard(h) <= maxcard): - - for filename in r_serv.smembers(h): - - for line in r_serv.smembers(filename): - - line = line.decode('UTF-8', errors='ignore') - line = xlm.quoteattr(line, {'"':'"', "'":"'"}) - - graph.add_edge(h, line+" -- "+filename) - -#OK \ No newline at end of file diff --git a/bin/packages/lib_jobs.py b/bin/packages/lib_jobs.py deleted file mode 100644 index 67a5ab3e..00000000 --- a/bin/packages/lib_jobs.py +++ /dev/null @@ -1,151 +0,0 @@ -import redis, time, sys, os, inspect - -from datetime import timedelta, date, datetime - -from pubsublogger import publisher - -def set_listof_pid(r_serv, filename, name): - """Create the pid list and it's pid members - - :param r_serv: -- Connexion to redis. - :param filename: -- the absolute pastes path name. - :param name: -- the traditionnal argv[0] (The name of the launched script) - - This function create a hashes in redis as follows and a set of pid. - - +------------+------------+---------------------+ - | Keys | Fields | Values | - +============+============+=====================+ - | 2045 | startime | 2014-05-09_11:44:17 | - +------------+------------+---------------------+ - | ... | prog | ./programme | - +------------+------------+---------------------+ - | ... | pid | 2045 | - +------------+------------+---------------------+ - | ... | paste | /home/folder/aux.gz | - +------------+------------+---------------------+ - | ... | kb | 54.12 | - +------------+------------+---------------------+ - - +------------+------------+ - | Keys | Members | - +============+============+ - | pid | 2045 | - +------------+------------+ - | ... | 2480 | - +------------+------------+ - - """ - r_serv.sadd("pid", os.getpid()) - r_serv.hmset(os.getpid(), - { - "startime":time.strftime("%Y-%m-%d_%H:%M:%S"), - "prog":name, - "pid":str(os.getpid()), - "paste":filename, - "Kb":round(os.path.getsize(filename)/1024.0,2) - }) - - - - -def update_listof_pid(r_serv): - """Remove pid from the pid list - - :param r_serv: -- Connexion to redis. - - Remove from the list and redis, pid which are terminated. - - """ - r_serv.srem("pid", os.getpid()) - r_serv.delete(os.getpid()) - - - - -def flush_list_of_pid(r_serv): - """Flush the datas in redis - - :param r_serv: -- Connexion to redis. - - Clean the redis database from the previous pid and pidlist inserted - - """ - for x in r_serv.smembers("pid"): - r_serv.delete(x) - - r_serv.delete("pid") - - - - -def format_display_listof_pid(dico, arg): - """Formating data for shell and human - - :param dico: (dict) dictionnary - :param arg: (str) Choosing argument - - :returns: (str) - - This function provide different displaying formats for the dictionnary's data. 
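# [editor's sketch] A worked example of the dispatch implemented just
# below, using the field names from the table above (values hypothetical):
#
#   dico = {'pid': '2045', 'uptime': '0:14:07', 'Kb': 54.12,
#           'paste': '/home/folder/aux.gz', 'prog': './programme',
#           'startime': '2014-05-09_11:44:17'}
#   format_display_listof_pid(dico, 'pid')  ->  "2045"
#   format_display_listof_pid(dico, 'up')   ->  "0:14:07"
#   any other arg -> the full "PID:...,uptime:...,kb:..." summary line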
- - """ - if arg == 'pid': - var = "{0}".format(dico['pid']) - elif arg == 'up': - var = "{0}".format(dico['uptime']) - elif arg == 'kb': - var = "{0}".format(dico['Kb']) - elif arg == 'paste': - var = "{0}".format(dico['paste']) - elif arg == 'startime': - var = "{0}".format(dico['startime']) - elif arg == 'prg': - var = "{0}".format(dico['prog']) - else: - var = "PID:{0},uptime:{1},kb:{2},paste:{3},prog:{4},startime:{5}".format(dico['pid'], - dico['uptime'], - dico['Kb'], - dico['paste'], - dico['prog'], - dico['startime']) - - return var - - - - -def display_listof_pid(r_serv, arg): - """Display the pid list from redis - - This function display infos in the shell about lauched process - - """ - jobs = {} - joblist = [] - try: - for job in r_serv.smembers("pid"): - jobs = r_serv.hgetall(job) - - if jobs != None: - start = datetime.strptime(r_serv.hget(job, "startime"), "%Y-%m-%d_%H:%M:%S") - - end = datetime.strptime(time.strftime("%Y-%m-%d_%H:%M:%S"), "%Y-%m-%d_%H:%M:%S") - jobs['uptime'] = str(abs(start - end)) - joblist.append(jobs) - else: - publisher.debug("display_list_of_pid Aborted due to lack of Information in Redis") - - joblist = sorted(joblist, key=lambda k: k['uptime'], reverse=True) - - for job in joblist: - print format_display_listof_pid(job, arg) - - if arg == "remain": - print "Remaining: {0}".format(r_serv.llen("filelist")) - - if arg == "processed": - print "processed: {0}".format(r_serv.llen("processed")) - - except TypeError: - publisher.error("TypeError for display_listof_pid") diff --git a/bin/packages/lib_redis_insert.py b/bin/packages/lib_redis_insert.py deleted file mode 100644 index 3896e28d..00000000 --- a/bin/packages/lib_redis_insert.py +++ /dev/null @@ -1,203 +0,0 @@ -import sys, hashlib, os, os.path, gzip, string, glob, itertools, copy, shutil -import redis, crcmod, mmh3, time, fileinput -import crcmod, mmh3 - -from operator import itemgetter, attrgetter -from pubsublogger import publisher - - - - -def listdirectory(path): - """Path Traversing Function. - - :param path: -- The absolute pathname to a directory. - - This function is returning all the absolute path of the files contained in - the argument directory. - - """ - fichier=[] - for root, dirs, files in os.walk(path): - - for i in files: - - fichier.append(os.path.join(root, i)) - - return fichier - - - - -clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty)) -"""It filters out non-printable characters from the string it receives.""" - - - -def select_hash(hashkind, line): - """Select the kind of hashing for the line. - - :param hashkind: -- (str) The name of the hash - :param line: -- (str) The string to hash. - - This function is a kind of hash selector which will use the hash passed - in argument to hash the string also passed in argument. 
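# [editor's sketch] The same selection can be table-driven; a minimal
# sketch covering only the two stdlib digests (crc and murmur would need
# crcmod/mmh3, as above):
import hashlib

HASHERS = {"md5": lambda s: hashlib.md5(s).hexdigest(),
           "sha1": lambda s: hashlib.sha1(s).hexdigest()}
print(HASHERS["md5"](b"example line"))   # -> 32-char hex digest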
- - """ - if hashkind == "md5": - hashline = hashlib.md5(line).hexdigest() - - elif hashkind == "sha1": - hashline = hashlib.sha1(line).hexdigest() - - elif hashkind == "crc": - crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF) - crc32.update(line) - hashline = crc32.hexdigest() - - elif hashkind == "murmur": - hashline = mmh3.hash(line) - - return str(hashline) - - - - -def redis_populate(pipe, folder, minline, hashkind, jmp, insert_type): - """Call another function with different "mode" - - :param pipe: -- Redis pipe - :param folder: -- the absolute path name to the folder where to process - :param minline: -- the minimum lenght of line to hash - :param hashkind: -- the hash to use - :param jmp: -- (bool) trigger the jumping line mode or not - :param insert_type: -- which kind of datastructure to create in redis. - - This Function actually call the function "insert_redis" with differents - method to process it. - In one way, x lines are jumped before the Insertion. - In another, all the line are hashed and inserted in redis. - - """ - for filename in folder: - - with gzip.open(filename, 'rb') as F: - start_line = 1 - - for num, line in enumerate(F, start_line): - - if jmp != 1: - - if (num % jmp) == 1 : - insert_redis(filename, - line, - pipe, - minline, - hashkind, - num, - insert_type) - - else: - insert_redis(filename, - line, - pipe, - minline, - hashkind, - num, - insert_type) - - pipe.execute() - - - - -def insert_redis(filename, line, pipe, minline, hashkind, num, insert_type): - """Insert hashed line in redis. - - :param filename: -- the absolute path name to the folder where to process - :param line: -- the clear line which will be hashed. - :param pipe: -- Redis pipe - :param minline: -- the minimum lenght of line to hash - :param hashkind: -- the hash to use - :param num: -- (int) the first line of the file (better human read) - :param insert_type: -- (int) Choose the datastructure used in redis. 
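# [editor's sketch] `pipe` is a redis-py pipeline; how a caller is
# expected to batch these inserts (legacy redis-py 2.x zincrby argument
# order, as used throughout this codebase; names hypothetical):
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
pipe = r.pipeline(False)            # no MULTI/EXEC, just command batching
pipe.zincrby("hash", "deadbeef", 1)
pipe.sadd("deadbeef", "paste1.gz")
pipe.execute()                      # one network round-trip for the batch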
- - This function insert hashed lines in the selected redis datastructure - The datastructure is represented as follow: - - case one: ALLIN - "hash"[hashedline][occurence] => to index all different hashs + scoring - "hashedline"[filename.gz] => to associate the file.gz to his hashedline - "L:hashedline"[clearline] => for the correspondance - - case two: SORTED SET (for the ./top.py script) - "hash"[hashedline][occurence] => to index all different hashs + scoring - "hashedline"[filename.gz] => to associate the file.gz to his hashedline - - case tree: BASIC SET (for ./Graph.py) - "hash"[hashedline] to index all different hashs (without scores) - "hashedline"[filename.gz] => to associate the file.gz to his hashedline - "filename.gz"[firstline] => for human reading - - """ - if (insert_type == 2): # ALLIN - if len(line) >= minline: - - pipe.zincrby("hash", select_hash(hashkind, line), 1) - pipe.sadd(select_hash(hashkind,line), filename.split('/',20)[-1]) - pipe.sadd("L:"+select_hash(hashkind, line), clean(line)) - - if (num == 1): - - pipe.sadd(filename.split('/',20)[-1], clean(line[0:80])) - - - elif (insert_type == 1): # SORTED SET FOR TOP100.py - - if len(line) >= minline: - - pipe.zincrby("hash", select_hash(hashkind, line), 1) - pipe.sadd(select_hash(hashkind, line), clean(line)) - - - elif (insert_type == 0): # SET FOR THE GRAPH - - if len(line) >= minline: - - pipe.sadd("hash", select_hash(hashkind, line)) - pipe.sadd(select_hash(hashkind,line), filename.split('/',20)[-1]) - - if (num == 1): - - pipe.sadd(filename.split('/',20)[-1], clean(line[0:80])) - - - - -def remove_pure_doppelganger(r_serv, nb): - """Remove identic paste - - :param r_serv: -- Redis connexion database - :param nb: -- (int) Number of execution wanted - - Add to a temporary list the hash of wholes files and compare the new hash - to the element of this list. If the hash is already inside, the file - is deleted otherwise the hash is added in the list. - - """ - hashlist = [] - for x in xrange(0,nb): - filename = r_serv.lpop("filelist") - - with open(filename, 'rb') as L: - hashline = hashlib.md5(L.read()).hexdigest() - - print len(hashlist) - - if hashline in hashlist: - - os.remove(filename) - publisher.debug("{0} removed".format(filename)) - print "{0} removed".format(filename) - else: - hashlist.append(hashline) diff --git a/bin/packages/lib_refine.py b/bin/packages/lib_refine.py index 3d4d4867..1fe458b8 100644 --- a/bin/packages/lib_refine.py +++ b/bin/packages/lib_refine.py @@ -15,32 +15,6 @@ from datetime import date, timedelta from dateutil.rrule import rrule, DAILY - -def create_graph_by_day_datastruct(r_serv, r_key, year, month): - """Creating a datastructure in redis. - - :param r_serv: -- Redis connexion database - :param r_key: -- (str) The name of the key read in redis (often the name of - the keywords category list) - :param year: -- (integer) The year to process - :param month: -- (integer) The month to process - - - """ - a = date(year, month, 01) - b = date(year, month, cal.monthrange(year, month)[1]) - - for dt in rrule(DAILY, dtstart = a, until = b): - r_serv.zadd(r_key+'_by_day',0,dt.strftime("%Y%m%d")) - - for Tfilename in r_serv.zrange(r_key+'_occur', 0, -1, withscores = True): - r_serv.zincrby(r_key+'_by_day', - Tfilename[0][-22:-12].replace('/',''), - Tfilename[1]) - - - - def is_luhn_valid(card_number): """Apply the Luhn algorithm to validate credit card. 
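# [editor's sketch] The Luhn check itself is outside this hunk; a compact
# reference implementation of the standard algorithm (not the project's
# exact code):
def luhn_valid(card_number):
    digits = [int(d) for d in str(card_number)]
    odd = sum(digits[-1::-2])                  # 1st, 3rd, ... from the right
    even = sum(sum(divmod(2 * d, 10)) for d in digits[-2::-2])
    return (odd + even) % 10 == 0

assert luhn_valid("4532015112830366")          # a well-known valid test number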
@@ -156,155 +130,3 @@ def checking_A_record(r_serv, domains_set): publisher.debug("URLs before: {0} after: {1} (valid)".format(num, score)) return (num, WalidA) - - - - -def refining_regex_dataset(r_serv, r_key, regex, min_match, year, month, luhn = True, dnscheck = True): - """Refine the "raw dataset" of paste with regulars expressions - - :param r_serv: -- Redis connexion database - :param r_key: -- (str) The name of the key read in redis (often the name of - the keywords category list) - :param min_match: -- (int) Below this number file are deleted - :param regex: -- Regular expression which will be match. - - This function Refine database created with classify_token_paste function. - It opening again the files which matchs the keywords category list, found - regular expression inside it and count how many time is found. - - If there is not too much match about the regular expression the file is - deleted from the list. - - Than it finally merge the result by day to be able to create a bar graph - which will represent how many occurence by day the regex match. - - """ - for filename in r_serv.zrange(r_key, 0, -1): - - with gzip.open(filename, 'rb') as F: - var = 0 - matchs = set([]) - - for num, kword in enumerate(F): - - match = re.findall(regex, kword) - var += len(match) - - for y in match: - if y != '' and len(y) < 100: - matchs.add(y) - # If there is less match than min_match delete it (False pos) - if len(matchs) <= min_match : - r_serv.zrem(r_key, filename) - publisher.debug("{0} deleted".format(filename)) - else: - # else changing the score. - if r_key == "creditcard_categ" and luhn: - for card_number in matchs: - if is_luhn_valid(card_number): - - r_serv.zincrby(r_key+'_occur', filename, 1) - - publisher.info("{1} is valid in the file {0}".format(filename, card_number)) - else: - publisher.debug("{0} card is invalid".format(card_number)) - - if r_key == "mails_categ" and dnscheck: - r_serv.zadd(r_key+'_occur', checking_MX_record(r_serv, matchs), filename) - - else: - # LUHN NOT TRIGGERED (Other Categs) - r_serv.zadd(r_key+'_occur', - len(matchs), - filename) - - create_graph_by_day_datastruct(r_serv, r_key, year, month) - - - - -def graph_categ_by_day(r_serv, filename, year, month, r_key): - """Create a bargraph representing regex matching by day - - :param r_serv: -- Redis connexion database - :param filename: -- (str) The absolute path where to save the figure.png - :param r_key: -- (str) The name of the key read in redis (often the name of - the keywords category list) - :param year: -- (integer) The year to process - :param month: -- (integer) The month to process - - This function display the amount of the category per day. 
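# [editor's sketch] The plot below consumes one score per day from the
# "<categ>_by_day" sorted set filled in by refining_regex_dataset; the
# lookup in isolation (assumes a populated local Redis):
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
print(r.zscore("mails_categ_by_day", "20131001"))   # occurrences that day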
- - """ - adate = [] - categ_num = [] - rcParams['figure.figsize'] = 15, 10 - - a = date(year, month, 01) - b = date(year, month, cal.monthrange(year, month)[1]) - - for dt in rrule(DAILY, dtstart = a, until = b): - adate.append(dt.strftime("%d")) - categ_num.append(r_serv.zscore(r_key+'_by_day',dt.strftime("%Y%m%d"))) - - n_groups = len(categ_num) - adress_scores = tuple(categ_num) - - index = np.arange(n_groups) - bar_width = 0.5 - opacity = 0.6 - - ladress = plt.bar(index, adress_scores, bar_width, - alpha = opacity, - color = 'b', - label = r_key) - - - plt.plot(tuple(categ_num), 'r--') - #plt.yscale('log') - plt.xlabel('Days') - plt.ylabel('Amount') - plt.title('Occurence of '+r_key+' by day') - plt.xticks(index + bar_width/2 , tuple(adate)) - - plt.legend() - plt.grid() - - plt.tight_layout() - - plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b', - orientation='portrait', papertype=None, format="png", - transparent=False, bbox_inches=None, pad_inches=0.1, - frameon=True) - - publisher.info(filename+".png"+" saved!") - - - - -def create_tld_list(url = "https://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"): - """Recover a tld list from url. - - :param url: -- The url of the tld list. - :return: -- list - - This function recover from mozilla.org the list of the effective tld names, - Save it as a file, and return a list of all the tld. - - - """ - domains = [] - htmlSource = urllib.urlopen(url).read() - with open("ICCANdomain", 'wb') as F: - F.write(htmlSource) - - with open("ICCANdomain", 'rb') as F: - - for num, line in enumerate(F): - if re.match(r"^\/\/|\n", line) == None: - domains.append(re.sub(r'\*', '', line[:-1])) - else: - publisher.info("Comment line ignored.") - - return domains diff --git a/bin/packages/lib_search.py b/bin/packages/lib_search.py deleted file mode 100644 index 8ec8973b..00000000 --- a/bin/packages/lib_search.py +++ /dev/null @@ -1,103 +0,0 @@ -import redis -import string - - -def create_common_hash_file(r_serv, zmin, zmax, filename): - """ Create a "top100".txt file. - - :param r_serv: -- connexion to redis database - :param zmin: -- (int) Offset of the top list - :param zmax: -- (int) Number of element wanted to be in the top list. - :param filename: -- the pathname to the created file. - - This Function create a ranking list between zmin and zman of the most common - hashs. - Line are written as follow in the file: - hash:[md5hash]:[cardinality]:[line] - All hashes represent a full line which mean it can be one char or more... - - """ - with open(filename, 'wb') as F: - - for h, num in r_serv.zrevrangebyscore("hash", "+inf", "-inf", zmin, zmax, True): - - F.write("hash:{0}:{1}:{2}\n".format(h, num, list(r_serv.smembers('L:'+h)))) - - - - -def paste_searching(r_serv, filename, pastename, mincard, maxcard): - """Search similar hashs from a given file. - - :param r_serv: -- connexion to redis database - :param filename: -- the pathname to the created file. - :param pastename: -- the name of the paste used to search in redis database. - :param mincard: -- the minimum occurence needed of an hash to be taken in count. - :param maxcard: -- the maximum occurence needed of an hash to be taken in count. - - This function return a text file which is a kind of synthesis about - where (in the others pastes) the hash of the given pastename have been found. 
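# [editor's sketch] The core test applied below, in isolation: does the
# set of pastes behind a hash contain the queried paste, within the
# cardinality bounds? (pure Python, no Redis needed; names hypothetical)
files_for_hash = set(["paste1.gz", "paste2.gz", "paste3.gz"])
pastename, mincard, maxcard = "paste2.gz", 2, 50
if pastename in files_for_hash and mincard <= len(files_for_hash) <= maxcard:
    print("hash shared by {0} pastes".format(len(files_for_hash)))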
- - """ - P = set([pastename]) - tmp_h = str() - tmp_set = set([]) - - with open(filename, 'wb') as F: - - F.write("Paste: {0}\nOptions used:\nMincard: {1}\nMaxcard: {2}\n\nContaining Following Hash:\n".format(pastename,mincard,maxcard)) - - for h in r_serv.smembers("hash"): - - if (r_serv.smembers(h).intersection(P) and r_serv.scard(h) >= mincard and r_serv.scard(h) <= maxcard): - - F.write(h+'\n') - tmp_set = tmp_set.union(r_serv.smembers(h).union(r_serv.smembers(tmp_h))) - - tmp_h = h - - F.write("\nSimilar Files:\n") - - for n, s in enumerate(tmp_set): - - F.write(str(n) + ': ' + s + '\n') - - - - -def paste_searching2(r_serv, filename, pastename, mincard, maxcard): - """Search similar hashs from a given file. - (On another kind of redis data structure) - - :param r_serv: -- connexion to redis database - :param filename: -- the pathname to the created file. - :param pastename: -- the name of the paste used to search in redis database. - :param mincard: -- the minimum occurence needed of an hash to be taken in count. - :param maxcard: -- the maximum occurence needed of an hash to be taken in count. - - This function return a text file which is a kind of synthesis about - where (in the others pastes) the hash of the given pastename have been found. - - """ - P = set([pastename]) - tmp_h = str() - tmp_set = set([]) - - with open(filename, 'wb') as F: - - F.write("Paste: {0}\nOptions used:\nMincard: {1}\nMaxcard: {2}\n\n###Containing Following Hash:### ###Occur### ###### Corresponding Line ######\n".format(pastename,mincard,maxcard)) - - for h in r_serv.zrange("hash", 0, -1): - - if (r_serv.smembers(h).intersection(P) and r_serv.scard(h) >= mincard and r_serv.scard(h) <= maxcard): - - F.write(h + ' -- ' + str(r_serv.zscore("hash",h)) + ' -- ' + str(list(r_serv.smembers('L:' + h))) + '\n') - tmp_set = tmp_set.union(r_serv.smembers(h).union(r_serv.smembers(tmp_h))) - - tmp_h = h - - F.write("\nSimilar Files:\n") - - for n, s in enumerate(tmp_set): - - F.write(str(n) + ': ' + s + '\n') diff --git a/bin/packages/lib_words.py b/bin/packages/lib_words.py index 7920f74c..e72fef52 100644 --- a/bin/packages/lib_words.py +++ b/bin/packages/lib_words.py @@ -19,316 +19,30 @@ from dateutil.rrule import rrule, DAILY from packages import * -def redis_words_ranking(pipe, r_serv, nb, minlength, maxlength): - """Looping function - :param pipe: -- Redis pipe. - :param nb: -- (int) Number of pastes proceeded by function - :param minlength: -- (int) passed to the next function - :param maxlength: -- (int) passed to the next function +def listdirectory(path): + """Path Traversing Function. + + :param path: -- The absolute pathname to a directory. + + This function is returning all the absolute path of the files contained in + the argument directory. """ - try: - for n in xrange(0,nb): + fichier=[] + for root, dirs, files in os.walk(path): - path = r_serv.lpop("filelist") + for i in files: - if path != None: - set_listof_pid(r_serv, path, sys.argv[0]) + fichier.append(os.path.join(root, i)) - redis_zincr_words(pipe, path, minlength, maxlength) + return fichier - update_listof_pid(r_serv) - r_serv.lpush("processed",path) - publisher.debug(path) - else: - publisher.debug("Empty list") - break - except (KeyboardInterrupt, SystemExit) as e: - flush_list_of_pid(r_serv) - publisher.debug("Pid list flushed") - - - - -def redis_zincr_words(pipe, filename, minlength, maxlength): - """Create news sorted set in redis. 
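# [editor's sketch] The per-day key used below is sliced straight out of
# the paste path; a worked example (layout hypothetical -- the slice
# assumes the trailing "/<name>.gz" part is exactly 12 characters):
filename = "/home/pastes/2013/10/01/paste042.gz"
day_key = filename[-22:-12].replace('/', '')
print(day_key)                                   # -> "20131001"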
- - :param minlength: -- (int) Minimum words length inserted - :param maxlength: -- (int) Maximum words length inserted - :param filename: -- The absolute path to the file.gz to process. - - Representation of the set in redis: - - +------------+------------+-----------+ - | Keys | Members | Scores | - +============+============+===========+ - | 20131001 | word1 | 142 | - +------------+------------+-----------+ - | ... | word2 | 120 | - +------------+------------+-----------+ - | 20131002 | ... | ... | - +------------+------------+-----------+ - - This function store all words between minlength and maxlength in redis. - Redis will count as well how much time each word will appear by day: - The cardinality. - - """ - tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps = True, discard_empty = True) - - with gzip.open(filename, 'rb') as F: - - blob = TextBlob(clean(F.read()), tokenizer = tokenizer) - - for word in blob.tokens: - - if (len(word) >= minlength) and (len(word) <= maxlength): - pipe.zincrby(filename[-22:-12].replace('/',''), word, 1) - - if (len(word) >= maxlength): - publisher.info("word bigger than {0} detected at {1}".format(maxlength, filename)) - publisher.info(word) - - pipe.execute() - - - - -def classify_token_paste(r_serv, listname, choicedatastruct, nb, r_set): - """Tokenizing on word category - - :param r_serv: -- Redis database connexion - :param listname: -- (str) path to the file containing the list of path of category files - :param choicedatastruct: -- (bool) Changing the index of datastructure - :param nb: -- (int) Number of pastes proceeded by function - - Redis data structures cas be choose as follow: - - +---------------+------------+-----------+ - | Keys | Members | Scores | - +===============+============+===========+ - | mails_categ | filename | 25000 | - +---------------+------------+-----------+ - | ... | filename2 | 2400 | - +---------------+------------+-----------+ - | web_categ | ... | ... | - +---------------+------------+-----------+ - - Or - - +--------------+-------------+-----------+ - | Keys | Members | Scores | - +==============+=============+===========+ - | filename | mails_categ | 100000 | - +--------------+-------------+-----------+ - | ... | web_categ | 24050 | - +--------------+-------------+-----------+ - | filename2 | ... | ... | - +--------------+-------------+-----------+ - - This function tokenise on all special characters like: @^\|[{#~}]!:;$^= - And insert data in redis if the token match the keywords in a list previously - created. - These lists of keywords can be list of everything you want but it's better - to create "category" of keywords. - - """ - - try: - for n in xrange(0,nb): - filename = r_serv.lpop(r_set) - - if filename != None: - - tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps = True, discard_empty = True) - set_listof_pid(r_serv, filename, sys.argv[0]) - - with open(listname, 'rb') as L: - # for each "categ" listed in the file - for num, fname in enumerate(L): - # contain keywords by categ - tmp_list = [] - #for each keywords - with open(fname[:-1], 'rb') as LS: - - for num, kword in enumerate(LS): - tmp_list.append(kword[:-1]) - - # for each paste - with gzip.open(filename, 'rb') as F: - - blob = TextBlob(clean(F.read()), - tokenizer = tokenizer) - - # for each paste token - for word in blob.tokens.lower(): - - if word in tmp_list: - # choosing between two data structures. 
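# [editor's sketch] What each orientation buys at query time (legacy
# redis-py, keys as in the tables above; paste name hypothetical):
#   r_serv.zrevrange("mails_categ", 0, 9, withscores=True)
#       -> the pastes scoring highest for the mail category
#   r_serv.zrevrange("/home/folder/aux.gz", 0, -1, withscores=True)
#       -> the category breakdown for one paste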
- if choicedatastruct: - r_serv.zincrby(filename, - fname.split('/')[-1][:-1], - 1) - else: - r_serv.zincrby(fname.split('/')[-1][:-1], - filename, - 1) - - update_listof_pid(r_serv) - - else: - publisher.debug("Empty list") - #r_serv.save() - break - - except (KeyboardInterrupt, SystemExit) as e: - flush_list_of_pid(r_serv) - publisher.debug("Pid list flushed") - - - - -def dectect_longlines(r_serv, r_key, store = False, maxlength = 500): - """Store longlines's linenumbers in redis - - :param r_serv: -- The redis connexion database - :param r_key: -- (str) The key name in redis - :param store: -- (bool) Store the line numbers or not. - :param maxlength: -- The limit between "short lines" and "long lines" - - This function connect to a redis list of filename (pastes filename); - Open the paste and check inside if there is some line with their - length >= to maxlength. - If yes, the paste is "tagged" as containing a longlines in another - redis structures, and the linenumber (of the long lines) can be stored - in addition if the argument store is at True. - - """ - try: - while True: - #r_key_list (categ) - filename = r_serv.lpop(r_key) - - if filename != None: - - set_listof_pid(r_serv, filename, sys.argv[0]) - - # for each pastes - with gzip.open(filename, 'rb') as F: - var = True - for num, line in enumerate(F): - - if len(line) >= maxlength: - #publisher.debug("Longline:{0}".format(line)) - if var: - r_serv.rpush("longlines", filename) - var = False - - if store: - r_serv.sadd(filename, num) - else: - publisher.debug("Line numbers of longlines not stored") - - update_listof_pid(r_serv) - else: - publisher.debug("Empty list") - return False - break - - except (KeyboardInterrupt, SystemExit) as e: - flush_list_of_pid(r_serv) - publisher.debug("Pid list flushed") - - - - -# NOT USED RIGHT NOW # -def recovering_longlines(r_serv): - """Get longlines with linenumbers - - """ - try: - for n in xrange(0,nb): - filename = r_serv.lpop("longlines") - - if filename != None: - # For each values in redis (longline's line number) - for numline in r_serv.smembers(filename): - - with gzip.open(filename,'rb') as F: - - for num, line in enumerate(F): - #When corresponding. - if int(num) == int(numline): - pass - # TREATMENT - else: - publisher.debug("Empty list") - r_serv.save() - break - - except (KeyboardInterrupt, SystemExit) as e: - flush_list_of_pid(r_serv) - publisher.debug("Pid list flushed") - - - - -def remove_longline_from_categ(r_serv, r_key, delete, store, maxlength): - """Remove from a set, file with long lines. - - :param r_serv: -- The redis connexion database - :param r_key: -- (str) The key name in redis - :param store: -- (bool) Store the line numbers or not. - :param delete: -- (bool) If true, delete the used key from redis. - :param maxlength: -- The limit between "short lines" and "long lines" - - """ - publisher.info("Number of file before:{0}".format(r_serv.zcard(r_key))) - - #Create a list of file to proceed (1) - for filename in r_serv.zrange(r_key, 0, -1): - r_serv.rpush(r_key+"_list", filename) - - #detecting longlines in pastes - dectect_longlines(r_serv, r_key+"_list", store, maxlength) - - #remove false positive members - while True: - fp_filename = r_serv.lpop("longlines") - - if fp_filename == None: - break - - else: - # if wanted, delete in addition the set with linenumbers (created with store) - if delete: - r_serv.zrem(r_key, fp_filename) - r_serv.delete(fp_filename) - - else: - #remove the file with longline from the r_key zset. 
- r_serv.zrem(r_key, fp_filename) - - publisher.info("Longline file removed from {0}, {1} Files remaining".format(r_key, r_serv.zcard(r_key))) - - - - -def detect_longline_from_list(r_serv, nb): - try: - for n in xrange(0,nb): - - if not dectect_longlines(r_serv, "filelist", True): - break - - except (KeyboardInterrupt, SystemExit) as e: - flush_list_of_pid(r_serv) - publisher.debug("Pid list flushed") +clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty)) +"""It filters out non-printable characters from the string it receives.""" @@ -369,182 +83,6 @@ def create_dirfile(r_serv, directory, overwrite): -def redis_interbargraph_set(r_serv, year, month, overwrite): - """Create a Redis sorted set. - - :param r_serv: -- connexion to redis database - :param year: -- (integer) The year to process - :param month: -- (integer) The month to process - :param overwrite: -- (bool) trigger the overwrite mode - - This function create inside redis the intersection of all days in - a month two by two. - Example: - For a month of 31days it will create 30 sorted set between day and - day+1 until the last day. - The overwrite mode delete the intersets and re-create them. - - """ - a = date(year, month, 01) - b = date(year, month, cal.monthrange(year, month)[1]) - - if overwrite: - r_serv.delete("InterSet") - - for dt in rrule(DAILY, dtstart = a, until = b - timedelta(1)): - dayafter = dt+timedelta(1) - - r_serv.delete(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))) - - r_serv.zinterstore( - str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")), - {str(dt.strftime("%Y%m%d")):1, - str(dayafter.strftime("%Y%m%d")):-1}) - - r_serv.zadd( - "InterSet", - 1, - str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))) - else: - for dt in rrule(DAILY, dtstart = a, until = b - timedelta(1)): - dayafter = dt+timedelta(1) - - if r_serv.zcard(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))) == 0: - - r_serv.zinterstore( - str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")), - {str(dt.strftime("%Y%m%d")):1, - str(dayafter.strftime("%Y%m%d")):-1}) - - r_serv.zadd( - "InterSet", - 1, - str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))) - - publisher.info(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))+" Intersection Created") - - else: - publisher.warning("Data already exist, operation aborted.") - - - - - -def word_bar_graph(r_serv, year, month, filename): - """Create an histogram. - - :param r_serv: -- connexion to redis database - :param year: -- (integer) The year to process - :param month: -- (integer) The month to process - :param filename: -- The absolute path where to save the figure.png - - This function use matplotlib to create an histogram. - The redis database need obviously to be populated first - with functions: redis_words_ranking and redis_interbargraph_set. 
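# [editor's sketch] The two series the histogram plots, fetched in
# isolation (assumes the per-day zsets and the "InterSet" index created
# by redis_interbargraph_set already exist):
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
print("words on 20131001: {0}".format(r.zcard("20131001")))
print("shared with next day: {0}".format(r.zcard("2013100120131002")))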
- - """ - lw = [] - adate = [] - inter = [0] - rcParams['figure.figsize'] = 15, 10 - - a = date(year, month, 01) - b = date(year, month, cal.monthrange(year,month)[1]) - - for dt in rrule(DAILY, dtstart = a, until = b): - lw.append(r_serv.zcard(dt.strftime("%Y%m%d"))) - adate.append(dt.strftime("%d")) - - for x in r_serv.zrange("InterSet", 0, 31): - inter.append(r_serv.zcard(x)) - - n_groups = len(lw) - card_words = tuple(lw) - card_interword = tuple(inter) - - index = np.arange(n_groups) - bar_width = 0.5 - opacity = 0.6 - - words = plt.bar(index, card_words, bar_width, - alpha=opacity, - color='g', - label='Words/day') - - lwords = plt.bar(index - 0.5, card_interword, bar_width, - alpha=opacity, - color='r', - label='Intersection') - - - plt.plot(tuple(inter), 'b--') - plt.xlabel(str(year)+'/'+str(month)+' Days') - plt.ylabel('Words') - plt.title('Words Cardinality & Intersection Histogram') - plt.xticks(index + bar_width/2 , tuple(adate)) - - plt.legend() - plt.grid() - - plt.tight_layout() - - plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b', - orientation='portrait', papertype=None, format="png", - transparent=False, bbox_inches=None, pad_inches=0.1, - frameon=True) - - publisher.info(filename+".png"+" saved!") - - - - -def create_data_words_curve(r_serv, r_serv2, year, month, filename): - """Create a Redis hashes. - - :param r_serv: -- connexion to redis database (read) - :param r_serv2: -- connexion to redis database (write) - :param year: -- (integer) The year to process - :param month: -- (integer) The month to process - :param filename: -- the path to the file which contain a list of words. - - - The hashes of redis is created as follow: - - +------------+------------+-----------+ - | Keys | Field | Values | - +============+============+===========+ - | word1 | 20131001 | 150 | - +------------+------------+-----------+ - | ... | 20131002 | 145 | - +------------+------------+-----------+ - | word2 | ... | ... | - +------------+------------+-----------+ - - The filename need to be a list of words separated by a carriage return - with an empty line at the end. - This function create datas which is used by the function - create_curve_with_word_file which create a csv file. - - """ - stop = stopwords.words('english') - a = date(year, month, 01) - b = date(year, month, cal.monthrange(year,month)[1]) - - with open(filename, 'rb') as F: - - for line in F: - - for dt in rrule(DAILY, dtstart = a, until = b): - - if r_serv.zscore(dt.strftime("%Y%m%d"), line[:-1]) is not None: - #tester si ca existe deja "en option" et ajouter un WARNING log - r_serv2.hmset(line[:-1], {str(dt.strftime("%Y%m%d")):r_serv.zscore(dt.strftime("%Y%m%d"), line[:-1])}) - else: - pass - - - - def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month): """Create a csv file used with dygraph. diff --git a/bin/tests/Bargraph.py b/bin/tests/Bargraph.py deleted file mode 100755 index f0dc4cd1..00000000 --- a/bin/tests/Bargraph.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_words import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information Leak - framework. 
It create an histogram which display the occurency - of the words per day but also the intersection of day and day-1 of these - occurencies''', - epilog = '''The Redis database need to be populated by the script - Wordsranking_Populate.py before using this one.''') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('y', - type = int, - metavar = "year", - help = 'The year processed.', - action = 'store') - - parser.add_argument('m', - type = int, - metavar = "month", - help = 'The month processed.', - action = 'store') - - parser.add_argument('-f', - type = str, - metavar = "filename", - default = "figure", - help = 'The absolute path name of the "figure.png"', - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - word_bar_graph(r,args.y,args.m, args.f) - -if __name__ == "__main__": - main() diff --git a/bin/tests/Bargraph_categ_by_day.py b/bin/tests/Bargraph_categ_by_day.py deleted file mode 100755 index 72ef2d6c..00000000 --- a/bin/tests/Bargraph_categ_by_day.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_refine import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information Leak - framework. It create an histogram which display the occurency - of the word category per days.''', - epilog = '''The Redis database need to be populated by the script - Classify_Paste_Token.py before. - It's also usefull to launch Remove_longline_fp.py and Refine_with_regex.py - to create a more accurate histogram. - example: ./Bargraph_categ_by_day.py 2013 12 mails_categ''') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-f', - type = str, - metavar = "filename", - default = "figure", - help = 'The absolute path name of the "figure.png"', - action = 'store') - - parser.add_argument('y', - type = int, - metavar = "year", - help = 'The year processed', - action = 'store') - - parser.add_argument('m', - type = int, - metavar = "month", - help = 'The month processed', - action = 'store') - - parser.add_argument('key', - type = str, - help ='name of the key to process in redis (the word_categ concerned)', - action = 'store') - - args = parser.parse_args() - - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - graph_categ_by_day(r, args.f, args.y, args.m, args.key) - -if __name__ == "__main__": - main() diff --git a/bin/tests/Classify_Paste_Token.py b/bin/tests/Classify_Paste_Token.py deleted file mode 100755 index e744ed35..00000000 --- a/bin/tests/Classify_Paste_Token.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_words import * -from packages.imported import * -from pubsublogger import publisher - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information Leak - framework. 
It create sets in redis as much as category - defined in the file given by the argument -l ''', - epilog = '''Example : seq 5000 | parallel -n0 -j 10 - ./classify_Paste_Token.py -nbp 200''') - - parser.add_argument('-l', - type = str, - default = "../files/list_categ_files", - help = 'Path to the list_categ_files (../files/list_categ_files)', - action = 'store') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-s', - help = 'Datastruct type, swapping between keys & members', - action = 'store_true') - - parser.add_argument('-nbp', - type = int, - default = 200, - help = 'Nbpaste', - action = 'store') - - parser.add_argument('-set', - type = str, - default = 'filelist', - help = 'The name of the list in redis which contain the filename to tokenise', - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - publisher.channel = "youpi" - - classify_token_paste(r, args.l, args.s, args.nbp, args.set) - -if __name__ == "__main__": - main() diff --git a/bin/tests/Display_pid.py b/bin/tests/Display_pid.py deleted file mode 100755 index a41008fc..00000000 --- a/bin/tests/Display_pid.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_words import * -from packages.imported import * -from pubsublogger import publisher - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information Leak - framework. It's here to monitor some script which take time - and lauched in parallel, You can display which process is running on which - paste and how much time it spent processing it''', - epilog = 'example : ./Display_pid -p pid -db 1 -d remain') - - parser.add_argument('-d', - type = str, - default = 'all', - choices=['paste', 'up', 'start', 'kb', 'all', 'pid', 'prg', 'remain', 'processed'], - help = 'Which info to display ?', - action = 'store') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - publisher.channel = "youpi" - - display_listof_pid(r, args.d) - -if __name__ == "__main__": - main() diff --git a/bin/tests/Graph.py b/bin/tests/Graph.py deleted file mode 100755 index d5a43181..00000000 --- a/bin/tests/Graph.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_gephi import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information - Leak framework. 
It create a gephi graph to have a global - view of the pastes but also which one are similar.''', - epilog = '''The Redis database need to be populated by the script - Populate.py before using this one.''') - - parser.add_argument('-t', - type = int, - default = 0, - help = 'Type of the Redis population (Same arg than in Populate.py)', - choices=[0, 2], - action = 'store') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-min', - type = int, - default = 3, - help = 'minimum linked nodes (default 3)', - action = 'store') - - parser.add_argument('-max', - type = int, - default = 50, - help = 'maximum linked nodes created (execute top.py before for more info)', - action = 'store') - - parser.add_argument('-p', - type = str, - default = '../graph/out', - metavar = 'path', - help = "pathname of the graph file created. ex: /home/graph", - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db, - unix_socket_path='/tmp/redis.sock') - - - Gephi_Graph(r, args.p+".gexf", args.min, args.max, args.t) - cprint("GRAPH CREATED AT:{0}.gexf".format(args.p),"green") - -if __name__ == "__main__": - main() - -#OK \ No newline at end of file diff --git a/bin/tests/Interset.py b/bin/tests/Interset.py deleted file mode 100755 index 82ea6496..00000000 --- a/bin/tests/Interset.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_words import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information - Leak framework. It create in redis the intersection - between all the days two by two of the date given in argument.''', - epilog = '''The Redis database need to be populated by the script - Wordsranking_Populate.py before using this one.''') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('y', - type = int, - metavar = "year", - help = 'The year', - action = 'store') - - parser.add_argument('m', - type = int, - metavar = "month", - help = 'The month', - action = 'store') - - parser.add_argument('-ow', - help = 'trigger the overwritting mode', - action = 'store_true') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - redis_interbargraph_set(r, args.y, args.m, args.ow) - -if __name__ == "__main__": - main() diff --git a/bin/tests/Populate.py b/bin/tests/Populate.py deleted file mode 100755 index 8de61b74..00000000 --- a/bin/tests/Populate.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_redis_insert import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information - Leak framework. 
Is Populate the Redis database with - the pastes names and theirs hashed line''', - epilog = '''This script need to be run first in order to use the others: - Graph.py, Search.py, Top.py ...''') - - parser.add_argument('input', - type = str, - metavar = 'pathfolder', - help = 'Input folder', - action = 'store') - - parser.add_argument('-t', - type = int, - default = 0, - help = 'type of population wanted 0 = set 1 = zset 2 = mix', - choices=[0, 1, 2], - action = 'store') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-H', - type = str, - default = 'md5', - metavar='hash', - help = 'The hash method (default md5)', - choices=["md5", "sha1", "crc", "murmur"], - action = 'store') - - parser.add_argument('-jmp', - type = int, - default = 10, - metavar = 'jump', - help = '''Jumping line factor. 1 = All the line are taken. X = jump X line - (default 10)''', - action = 'store') - - parser.add_argument('-ml', - type = int, - default = 1, - metavar = 'minlnline', - help = '''Length line factor. 1 = All the line are taken. - X = each line >= X char (default 1)''', - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline() - - redis_populate(p, listdirectory(args.input), args.ml, args.H, args.jmp, args.t) - -if __name__ == "__main__": - main() - -#OK \ No newline at end of file diff --git a/bin/tests/Refine_with_regex.py b/bin/tests/Refine_with_regex.py deleted file mode 100755 index cd2d7e5a..00000000 --- a/bin/tests/Refine_with_regex.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_refine import * -from packages.imported import * -from pubsublogger import publisher - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information - Leak framework. Is refining a redis set by - re analysing set with regex and changing the score by the number of - regex matching''', - epilog = '''example of use: ./Refine_with_regex.py 2013 12 -regex mail - -key mails_categ''') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-nbm', - type = int, - default = 1, - help = 'Minimum matching regex occurence per file to keep in redis (1)', - action = 'store') - - parser.add_argument('-regex', - type = str, - default = 'mail', - choices=['mail', 'card', 'url', 'bitcoin'], - help = 'Which regex wanted to be use to match', - action = 'store') - - parser.add_argument('-key', - type = str, - default = "mails_categ", - help = 'Name of the key to process in redis (same name than the wordlist concerned)', - action = 'store') - - parser.add_argument('y', - type = int, - metavar = "year", - help = 'The year processed', - action = 'store') - - parser.add_argument('m', - type = int, - metavar = "month", - help = 'The month processed', - action = 'store') - - args = parser.parse_args() - - if args.regex == 'mail': - regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}" - elif args.regex == 'card': - regex = "4[0-9]{12}(?:[0-9]{3})?" 
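# [editor's note] -regex also accepts 'url' in the choices above, but no
# branch here assigns a pattern for it, so `regex` stays unbound and the
# script would raise a NameError for that option.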
- elif args.regex == 'bitcoin': - regex = "[13][1-9A-HJ-NP-Za-km-z]{26,33}" - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - publisher.channel = "youpi" - - refining_regex_dataset(r, args.key, regex, args.nbm, args.y, args.m) - -if __name__ == "__main__": - main() diff --git a/bin/tests/Remove_Doppelganger.py b/bin/tests/Remove_Doppelganger.py deleted file mode 100755 index 08266af8..00000000 --- a/bin/tests/Remove_Doppelganger.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_redis_insert import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information - Leak framework. It Add to a temporary list the hash - of wholes files and compare the new hash to the element of this - list. If the hash is already inside, the file is deleted - otherwise the hash is added in the list.''', - epilog = '''This script need Redis to be populated before by - ./Dir.py''') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-nbp', - type = int, - default = 200, - help = 'nbpaste', - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - remove_pure_doppelganger(r, args.nbp) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/bin/tests/Remove_longline_fp.py b/bin/tests/Remove_longline_fp.py deleted file mode 100755 index ec00a711..00000000 --- a/bin/tests/Remove_longline_fp.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_words import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information Leak - framework. 
It removes the line which are in redis under - the "key" name argument''', - epilog = '''This script is usually usefull launched after using - ./Classify_Paste_Token.py example: ./Remove_longline_fp.py mails_categ''') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('key', - type = str, - help = 'Name of the key to process in redis ("")', - action = 'store') - - parser.add_argument('-d', - help = 'Delete the set of longline created?', - action = 'store_true') - - parser.add_argument('-s', - help = 'Store the longline numbers inside a set?', - action = 'store_true') - - parser.add_argument('-max', - type = int, - default = 500, - help = 'The limit between "short lines" and "long lines" (500)', - action = 'store') - - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - #remove_longline_from_categ(r, args.key, args.d, args.s, args.max) - detect_longline_from_list(r,args.max) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/bin/tests/Search.py b/bin/tests/Search.py deleted file mode 100755 index 1ea0f408..00000000 --- a/bin/tests/Search.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_search import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = 'Analysis Information Leak framework', - epilog = 'MSc Student Internship') - - parser.add_argument('-db', - default = 0, - type = int, - help = 'The name of the Redis DB', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('name', - type = str, - metavar = 'pastename', - help = 'The name of the paste', - action = 'store') - - parser.add_argument('-min', - type = int, - default = 3, - help = 'minimum linked hashs (default 3)', - action = 'store') - - parser.add_argument('-max', - type = int, - default = 50, - help = 'maximum linked hash (execute top.py to be more aware)', - action = 'store') - - parser.add_argument('-p', - type = str, - default = '../graph/Search_', - metavar = 'path', - help = "pathname of the file created.", - action = 'store') - - parser.add_argument('-t', - type = int, - default = 0, - help = 'Type of the Redis population (Same arg than in Populate.py)', - choices=[0, 2], - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db, - unix_socket_path='/tmp/redis.sock') - - - if args.t == 2: - paste_searching2(r, args.p+args.name+".txt", args.name, args.min, args.max) - cprint("GRAPH CREATED AT:{0}{1}.txt".format(args.p,args.name),"green") - elif args.t == 0: - paste_searching(r, args.p+args.name+".txt", args.name, args.min, args.max) - cprint("GRAPH CREATED AT:{0}{1}.txt".format(args.p,args.name),"green") - pass - - -if __name__ == "__main__": - main() diff --git a/bin/tests/Top.py b/bin/tests/Top.py deleted file mode 100755 index 3af48d6f..00000000 --- a/bin/tests/Top.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_search import Create_Common_Hash_File -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = '''This script is a part of the Analysis Information Leak - framework. 
It create a text file with the top common hash - which are in the redis database''', - epilog = '''The Redis database need to be populated by the script - Populate.py before using this one.''') - - parser.add_argument('-db', - default = 0, - type = int, - help = 'The name of the Redis DB', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-off', - default = 1, - type = int, - metavar = 'offset', - help = 'Starting point of the toplist', - action = 'store') - - parser.add_argument('-top', - default = 100, - type = int, - metavar = '100', - help = 'How many occurence? top 10-50-100 ?', - action = 'store') - - parser.add_argument('-p', - type = str, - default = '../graph/top', - metavar = 'path', - help = "pathname of the file created ex: /home/top", - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - create_common_cash_file(r, args.off, args.top, args.p+str(args.top)+".top") - cprint("LIST CREATED","green") - -if __name__ == "__main__": - main() - -#OK \ No newline at end of file diff --git a/bin/tests/WordsCurve_Populate.py b/bin/tests/WordsCurve_Populate.py deleted file mode 100755 index 2ca6b71c..00000000 --- a/bin/tests/WordsCurve_Populate.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_words import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = 'Analysis Information Leak framework', - epilog = 'Thats drawing') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB To get the info (0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-db1', - type = int, - default = 1, - help = 'The name of the Redis DB To store (1)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('f', - type = str, - metavar= "file", - help = 'Words filename', - action = 'store') - - parser.add_argument('y', - type = int, - metavar = "year", - help = 'The year', - action = 'store') - - parser.add_argument('m', - type = int, - metavar = "month", - help = 'The month', - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - r2 = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db1) - - - p = r.pipeline(False) - - create_data_words_curve(r, r2, args.y, args.m, args.f) - -if __name__ == "__main__": - main() diff --git a/bin/tests/WordsCurves.py b/bin/tests/WordsCurves.py deleted file mode 100755 index 5796f111..00000000 --- a/bin/tests/WordsCurves.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_words import * -from packages.imported import * - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = 'Analysis Information Leak framework', - epilog = 'Thats drawing') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-cvs', - type = str, - metavar = "filename", - default = "wordstrendingdata", - help = 'The name of the cvs file wanted to be created', - action = 'store') - - parser.add_argument('f', - type = str, - help = 'The file with the list of words', - action = 'store') - - parser.add_argument('y', - type = int, - metavar = "year", - help = 'The year', - action = 'store') - - parser.add_argument('m', - type = int, - 
metavar = "month", - help = 'The month', - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - create_curve_with_word_file(r, args.cvs, args.f, args.y, args.m) - -if __name__ == "__main__": - main() diff --git a/bin/tests/Wordsranking_Populate.py b/bin/tests/Wordsranking_Populate.py deleted file mode 100755 index 0b0d0e7b..00000000 --- a/bin/tests/Wordsranking_Populate.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/python2.7 -# -*-coding:UTF-8 -* - -from packages.lib_words import * -from packages.imported import * -from pubsublogger import publisher - -def main(): - """Main Function""" - - parser = argparse.ArgumentParser( - description = 'Analysis Information Leak framework', - epilog = 'example : seq 2 | parallel ./Wordsranking_Populate.py -nbp 20') - - parser.add_argument('-nbp', - type = int, - default = 200, - help = 'nbpaste', - action = 'store') - - parser.add_argument('-db', - type = int, - default = 0, - help = 'The name of the Redis DB (default 0)', - choices=[0, 1, 2, 3, 4], - action = 'store') - - parser.add_argument('-min', - type = int, - default = 4, - help = 'Minimum length of the inserted words (default 4)', - action = 'store') - - parser.add_argument('-max', - type = int, - default = 200, - help = 'Maximum length of the inserted words (default 200)', - action = 'store') - - args = parser.parse_args() - - r = redis.StrictRedis( - host='localhost', - port=6379, - db=args.db) - - p = r.pipeline(False) - - publisher.channel = "youpi" - - redis_words_ranking(p, r, args.nbp, args.min, args.max) - -if __name__ == "__main__": - main()