mirror of https://github.com/CIRCL/AIL-framework
maxi cleanup old code :'(
parent
7a3c216787
commit
04a8f1bdf2
|
@ -1,64 +0,0 @@
|
||||||
import networkx as nx
|
|
||||||
import xml.sax.saxutils as xlm
|
|
||||||
import redis
|
|
||||||
|
|
||||||
def Gephi_Graph(r_serv, graphpath, mincard, maxcard, insert_type):
    """Build a Gephi graph file from the hashes stored in redis.

    :param r_serv: -- connexion to redis database
    :param graphpath: -- the absolute path of the .gephi graph created.
    :param mincard: -- the minimum links between 2 nodes to be created
    :param maxcard: -- the maximum links between 2 nodes to be created
    :param insert_type: -- 0 = simple set layout, 2 = sorted set layout.

    Thin dispatcher over Create_Graph: its only job is to pick which
    redis datastructure (simple set vs sorted set) to iterate for the
    stored hashes.
    """
    graph = nx.Graph()

    # Choose the hash iterator matching the layout used at insert time;
    # any other insert_type yields an empty graph (as before).
    if insert_type == 0:
        hashes = r_serv.smembers("hash")
    elif insert_type == 2:
        hashes = r_serv.zrange("hash", 0, -1)
    else:
        hashes = []

    for h in hashes:
        Create_Graph(r_serv, graph, h, graphpath, mincard, maxcard)

    nx.write_gexf(graph, graphpath)
    print(nx.info(graph))
|
||||||
def Create_Graph(r_serv, graph, h, graphpath, mincard, maxcard):
    """Add one hash node and its paste edges to the networkx graph.

    :param r_serv: -- connexion to redis database
    :param graph: -- networkx graph object (mutated in place)
    :param h: -- (str) the hash which will be transformed into a node.
    :param graphpath: -- the absolute path of the .gephi graph (unused here).
    :param mincard: -- the minimum links between 2 nodes to be created
    :param maxcard: -- the maximum links between 2 nodes to be created

    Links pastes with their own hashed lines; a hash contained in
    several pastes is a common hash.
    """
    # Guard clause: ignore hashes outside the requested cardinality window.
    if not (mincard <= r_serv.scard(h) <= maxcard):
        return

    for filename in r_serv.smembers(h):
        for raw in r_serv.smembers(filename):
            text = raw.decode('UTF-8', errors='ignore')
            # Escape quotes so the line is a valid XML attribute value.
            text = xlm.quoteattr(text, {'"':'&quot;', "'":"&apos;"})
            graph.add_edge(h, text + " -- " + filename)
|
@ -1,151 +0,0 @@
|
||||||
import redis, time, sys, os, inspect
|
|
||||||
|
|
||||||
from datetime import timedelta, date, datetime
|
|
||||||
|
|
||||||
from pubsublogger import publisher
|
|
||||||
|
|
||||||
def set_listof_pid(r_serv, filename, name):
    """Register the current process in redis.

    :param r_serv: -- Connexion to redis.
    :param filename: -- the absolute pastes path name.
    :param name: -- the traditional argv[0] (name of the launched script)

    Adds the pid to the "pid" set and stores a hash keyed by the pid
    holding startime / prog / pid / paste / Kb fields describing the job.
    """
    pid = os.getpid()
    r_serv.sadd("pid", pid)

    job_info = {
        "startime": time.strftime("%Y-%m-%d_%H:%M:%S"),
        "prog": name,
        "pid": str(pid),
        "paste": filename,
        # Paste size in kilobytes, rounded to 2 decimals.
        "Kb": round(os.path.getsize(filename) / 1024.0, 2),
    }
    r_serv.hmset(pid, job_info)
|
||||||
def update_listof_pid(r_serv):
    """Deregister the current (terminated) process from redis.

    :param r_serv: -- Connexion to redis.

    Removes the pid from the "pid" set and deletes its per-pid hash.
    """
    pid = os.getpid()
    r_serv.srem("pid", pid)
    r_serv.delete(pid)
|
||||||
def flush_list_of_pid(r_serv):
    """Flush every pid entry previously inserted in redis.

    :param r_serv: -- Connexion to redis.

    Deletes each per-pid hash referenced by the "pid" set, then the
    "pid" set itself.
    """
    for pid in r_serv.smembers("pid"):
        r_serv.delete(pid)

    r_serv.delete("pid")
|
||||||
def format_display_listof_pid(dico, arg):
    """Format a job dictionary for shell/human display.

    :param dico: (dict) job info with keys pid, uptime, Kb, paste,
                 prog, startime
    :param arg: (str) which single field to display ('pid', 'up', 'kb',
                'paste', 'startime', 'prg'); any other value selects the
                full one-line summary.
    :returns: (str)
    """
    # Dispatch table replaces the original if/elif chain: each display
    # argument maps to the dictionary key it exposes.
    field_by_arg = {
        'pid': 'pid',
        'up': 'uptime',
        'kb': 'Kb',
        'paste': 'paste',
        'startime': 'startime',
        'prg': 'prog',
    }
    if arg in field_by_arg:
        return "{0}".format(dico[field_by_arg[arg]])

    # Default: full one-line summary of the job.
    return "PID:{0},uptime:{1},kb:{2},paste:{3},prog:{4},startime:{5}".format(
        dico['pid'], dico['uptime'], dico['Kb'],
        dico['paste'], dico['prog'], dico['startime'])
|
||||||
def display_listof_pid(r_serv, arg):
    """Display infos in the shell about the launched processes.

    :param r_serv: -- Connexion to redis.
    :param arg: -- (str) field selector forwarded to
                  format_display_listof_pid; "remain"/"processed" also
                  print the corresponding queue lengths.
    """
    joblist = []
    try:
        for pid in r_serv.smembers("pid"):
            jobs = r_serv.hgetall(pid)

            # BUGFIX: hgetall() returns an empty dict (never None) for a
            # missing key, so the original `jobs != None` guard could not
            # catch missing entries; test truthiness instead.
            if jobs:
                start = datetime.strptime(jobs["startime"], "%Y-%m-%d_%H:%M:%S")
                end = datetime.strptime(time.strftime("%Y-%m-%d_%H:%M:%S"), "%Y-%m-%d_%H:%M:%S")
                jobs['uptime'] = str(abs(start - end))
                joblist.append(jobs)
            else:
                publisher.debug("display_list_of_pid Aborted due to lack of Information in Redis")

        # Longest-running jobs first.
        joblist = sorted(joblist, key=lambda k: k['uptime'], reverse=True)

        for job in joblist:
            print(format_display_listof_pid(job, arg))

        if arg == "remain":
            print("Remaining: {0}".format(r_serv.llen("filelist")))

        if arg == "processed":
            print("processed: {0}".format(r_serv.llen("processed")))

    except TypeError:
        publisher.error("TypeError for display_listof_pid")
|
|
@ -1,203 +0,0 @@
|
||||||
import sys, hashlib, os, os.path, gzip, string, glob, itertools, copy, shutil
|
|
||||||
import redis, crcmod, mmh3, time, fileinput
|
|
||||||
import crcmod, mmh3
|
|
||||||
|
|
||||||
from operator import itemgetter, attrgetter
|
|
||||||
from pubsublogger import publisher
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def listdirectory(path):
    """Path traversing function.

    :param path: -- The absolute pathname to a directory.
    :returns: list of the absolute paths of all files contained (at any
              depth) in the argument directory.
    """
    found = []
    for root, dirs, files in os.walk(path):
        # Rebuild the full path of every file under this directory.
        found.extend(os.path.join(root, name) for name in files)
    return found
|
||||||
def clean(dirty):
    """Filter out non-printable characters from the string it receives.

    :param dirty: -- (str) input text
    :returns: (str) the input with every character not in
              ``string.printable`` removed.
    """
    # PEP 8 (E731): a named lambda is replaced by a proper def, which
    # also gives the callable a real __name__ and docstring.
    return ''.join(filter(string.printable.__contains__, dirty))
|
||||||
def select_hash(hashkind, line):
    """Hash *line* with the algorithm selected by *hashkind*.

    :param hashkind: -- (str) the name of the hash
                        ("md5", "sha1", "crc" or "murmur")
    :param line: -- (str) the string to hash.
    :returns: (str) the digest rendered as a string.
    """
    if hashkind == "md5":
        digest = hashlib.md5(line).hexdigest()

    elif hashkind == "sha1":
        digest = hashlib.sha1(line).hexdigest()

    elif hashkind == "crc":
        # Standard CRC-32 parameters: polynomial 0x104c11db7, inverted output.
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(line)
        digest = crc32.hexdigest()

    elif hashkind == "murmur":
        digest = mmh3.hash(line)

    return str(digest)
|
||||||
def redis_populate(pipe, folder, minline, hashkind, jmp, insert_type):
    """Hash the lines of every paste and insert them in redis.

    :param pipe: -- Redis pipe
    :param folder: -- iterable of absolute .gz paste paths to process
    :param minline: -- the minimum length of line to hash (forwarded)
    :param hashkind: -- the hash to use (forwarded)
    :param jmp: -- (int) when != 1, only every jmp-th line is inserted
    :param insert_type: -- which kind of datastructure to create in redis.

    Walks every file and feeds each selected line to insert_redis; the
    pipeline is executed once per file.

    Note: the original had the same insert_redis call duplicated in the
    jmp / no-jmp branches; both branches are now folded into a single
    condition.
    """
    for filename in folder:
        with gzip.open(filename, 'rb') as F:
            for num, line in enumerate(F, 1):
                # jmp == 1 keeps every line; otherwise keep lines whose
                # 1-based number is 1 modulo jmp (line 1, jmp+1, ...).
                if jmp == 1 or (num % jmp) == 1:
                    insert_redis(filename, line, pipe, minline,
                                 hashkind, num, insert_type)

        pipe.execute()
|
||||||
def insert_redis(filename, line, pipe, minline, hashkind, num, insert_type):
    """Insert one hashed line in the selected redis datastructure.

    :param filename: -- the absolute path name of the processed paste
    :param line: -- the clear line which will be hashed.
    :param pipe: -- Redis pipe
    :param minline: -- the minimum length of line to hash
    :param hashkind: -- the hash to use
    :param num: -- (int) 1-based line number inside the file
    :param insert_type: -- (int) 2 = ALLIN, 1 = sorted set (top100.py),
                           0 = basic set (Graph.py)

    Layouts written:
    - "hash"[hashedline] indexes all distinct hashes (scored for 1/2);
    - hashedline -> file basenames containing it;
    - type 2 also keeps "L:hashedline" -> clear line, and types 0/2
      store the file's first line (truncated) for human reading.
    """
    # Lines shorter than minline are skipped for every insert_type.
    if len(line) < minline:
        return

    # Hoisted: select_hash is deterministic, no need to recompute it.
    hashed = select_hash(hashkind, line)
    basename = filename.split('/', 20)[-1]

    if insert_type == 2:    # ALLIN
        pipe.zincrby("hash", hashed, 1)
        pipe.sadd(hashed, basename)
        pipe.sadd("L:" + hashed, clean(line))

        if num == 1:
            # First line of the file, truncated to 80 chars.
            pipe.sadd(basename, clean(line[0:80]))

    elif insert_type == 1:  # SORTED SET FOR TOP100.py
        pipe.zincrby("hash", hashed, 1)
        pipe.sadd(hashed, clean(line))

    elif insert_type == 0:  # SET FOR THE GRAPH
        pipe.sadd("hash", hashed)
        pipe.sadd(hashed, basename)

        if num == 1:
            pipe.sadd(basename, clean(line[0:80]))
|
||||||
def remove_pure_doppelganger(r_serv, nb):
    """Remove pastes that are byte-for-byte identical.

    :param r_serv: -- Redis connexion database
    :param nb: -- (int) Number of files to pop from the "filelist" queue

    Hashes each whole file and keeps the hashes already seen; a file
    whose hash is already known is deleted from disk, otherwise its
    hash is recorded.
    """
    # A set gives O(1) membership tests; the original list was O(n)
    # per lookup, i.e. O(n^2) overall.
    seen = set()
    for _ in range(nb):
        filename = r_serv.lpop("filelist")

        with open(filename, 'rb') as F:
            hashline = hashlib.md5(F.read()).hexdigest()

        print(len(seen))

        if hashline in seen:
            os.remove(filename)
            publisher.debug("{0} removed".format(filename))
            print("{0} removed".format(filename))
        else:
            seen.add(hashline)
|
|
@ -15,32 +15,6 @@ from datetime import date, timedelta
|
||||||
from dateutil.rrule import rrule, DAILY
|
from dateutil.rrule import rrule, DAILY
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def create_graph_by_day_datastruct(r_serv, r_key, year, month):
    """Create the per-day datastructure in redis.

    :param r_serv: -- Redis connexion database
    :param r_key: -- (str) The name of the key read in redis (often the
                    name of the keywords category list)
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    Seeds "<r_key>_by_day" with a zero score for every day of the month,
    then folds the per-file scores of "<r_key>_occur" into the day
    extracted from each filename.
    """
    first_day = date(year, month, 1)
    last_day = date(year, month, cal.monthrange(year, month)[1])

    # One zero-scored member per calendar day (YYYYMMDD).
    for dt in rrule(DAILY, dtstart=first_day, until=last_day):
        r_serv.zadd(r_key + '_by_day', 0, dt.strftime("%Y%m%d"))

    for member, score in r_serv.zrange(r_key + '_occur', 0, -1, withscores=True):
        # The date lives in the filename: chars [-22:-12] are
        # "YYYY/MM/DD"; stripping the slashes yields the day member.
        day = member[-22:-12].replace('/', '')
        r_serv.zincrby(r_key + '_by_day', day, score)
|
||||||
def is_luhn_valid(card_number):
|
def is_luhn_valid(card_number):
|
||||||
"""Apply the Luhn algorithm to validate credit card.
|
"""Apply the Luhn algorithm to validate credit card.
|
||||||
|
|
||||||
|
@ -156,155 +130,3 @@ def checking_A_record(r_serv, domains_set):
|
||||||
|
|
||||||
publisher.debug("URLs before: {0} after: {1} (valid)".format(num, score))
|
publisher.debug("URLs before: {0} after: {1} (valid)".format(num, score))
|
||||||
return (num, WalidA)
|
return (num, WalidA)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def refining_regex_dataset(r_serv, r_key, regex, min_match, year, month, luhn = True, dnscheck = True):
    """Refine the "raw dataset" of pastes with regular expressions.

    :param r_serv: -- Redis connexion database
    :param r_key: -- (str) The name of the key read in redis (often the
                    name of the keywords category list)
    :param regex: -- Regular expression which will be matched.
    :param min_match: -- (int) Below this number files are deleted
    :param year: -- (int) forwarded to create_graph_by_day_datastruct
    :param month: -- (int) forwarded to create_graph_by_day_datastruct
    :param luhn: -- (bool) validate credit card numbers with Luhn
    :param dnscheck: -- (bool) validate mail domains with an MX lookup

    Re-opens each file referenced by r_key, counts the distinct regex
    matches, removes false positives (too few matches) and scores the
    remaining files in "<r_key>_occur"; finally merges the result by
    day so a bar graph can be drawn.
    """
    for filename in r_serv.zrange(r_key, 0, -1):
        with gzip.open(filename, 'rb') as F:
            total = 0
            found = set([])

            for num, kword in enumerate(F):
                hits = re.findall(regex, kword)
                total += len(hits)

                # Keep plausible, non-empty matches only.
                for hit in hits:
                    if hit != '' and len(hit) < 100:
                        found.add(hit)

            # If there is less match than min_match delete it (false pos).
            if len(found) <= min_match:
                r_serv.zrem(r_key, filename)
                publisher.debug("{0} deleted".format(filename))
            else:
                # Otherwise adjust the score.
                if r_key == "creditcard_categ" and luhn:
                    for card_number in found:
                        if is_luhn_valid(card_number):
                            r_serv.zincrby(r_key+'_occur', filename, 1)
                            publisher.info("{1} is valid in the file {0}".format(filename, card_number))
                        else:
                            publisher.debug("{0} card is invalid".format(card_number))

                if r_key == "mails_categ" and dnscheck:
                    r_serv.zadd(r_key+'_occur', checking_MX_record(r_serv, found), filename)
                else:
                    # LUHN NOT TRIGGERED (Other Categs)
                    r_serv.zadd(r_key+'_occur',
                                len(found),
                                filename)

    create_graph_by_day_datastruct(r_serv, r_key, year, month)
|
||||||
def graph_categ_by_day(r_serv, filename, year, month, r_key):
    """Create a bar graph representing regex matches by day.

    :param r_serv: -- Redis connexion database
    :param filename: -- (str) The absolute path where to save the figure.png
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param r_key: -- (str) The name of the key read in redis (often the
                    name of the keywords category list)

    Displays the amount of the category per day and saves it as a PNG.
    """
    days = []
    scores = []
    rcParams['figure.figsize'] = 15, 10

    first_day = date(year, month, 1)
    last_day = date(year, month, cal.monthrange(year, month)[1])

    # One bar per calendar day, scored from the *_by_day sorted set.
    for dt in rrule(DAILY, dtstart=first_day, until=last_day):
        days.append(dt.strftime("%d"))
        scores.append(r_serv.zscore(r_key+'_by_day', dt.strftime("%Y%m%d")))

    n_groups = len(scores)
    adress_scores = tuple(scores)

    index = np.arange(n_groups)
    bar_width = 0.5
    opacity = 0.6

    ladress = plt.bar(index, adress_scores, bar_width,
                      alpha = opacity,
                      color = 'b',
                      label = r_key)

    # Dashed trend line overlaid on the bars.
    plt.plot(tuple(scores), 'r--')
    #plt.yscale('log')
    plt.xlabel('Days')
    plt.ylabel('Amount')
    plt.title('Occurence of '+r_key+' by day')
    plt.xticks(index + bar_width/2 , tuple(days))

    plt.legend()
    plt.grid()

    plt.tight_layout()

    plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
                orientation='portrait', papertype=None, format="png",
                transparent=False, bbox_inches=None, pad_inches=0.1,
                frameon=True)

    publisher.info(filename+".png"+" saved!")
|
||||||
def create_tld_list(url = "https://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"):
    """Recover the effective-TLD list from mozilla.org.

    :param url: -- The url of the tld list.
    :return: -- list of all the tld names.

    Saves the raw list in the "ICCANdomain" file, then re-reads it,
    logging comment/blank lines and stripping wildcard markers from
    the rest.
    """
    domains = []
    htmlSource = urllib.urlopen(url).read()

    # Keep a local copy of the raw list on disk.
    with open("ICCANdomain", 'wb') as F:
        F.write(htmlSource)

    with open("ICCANdomain", 'rb') as F:
        for num, line in enumerate(F):
            # "//" lines are comments, bare "\n" lines are separators.
            if re.match(r"^\/\/|\n", line) is not None:
                publisher.info("Comment line ignored.")
            else:
                # Drop the trailing newline and any wildcard "*".
                domains.append(re.sub(r'\*', '', line[:-1]))

    return domains
|
@ -1,103 +0,0 @@
|
||||||
import redis
|
|
||||||
import string
|
|
||||||
|
|
||||||
|
|
||||||
def create_common_hash_file(r_serv, zmin, zmax, filename):
    """Create a "top100"-style ranking file of the most common hashes.

    :param r_serv: -- connexion to redis database
    :param zmin: -- (int) Offset of the top list
    :param zmax: -- (int) Number of elements wanted in the top list.
    :param filename: -- the pathname of the created file.

    Lines are written as follows in the file:
    hash:[md5hash]:[cardinality]:[line]
    Each hash represents a full line, which may be a single char or more.
    """
    with open(filename, 'wb') as out:
        # Highest-scored hashes first, with their scores attached.
        for h, num in r_serv.zrevrangebyscore("hash", "+inf", "-inf", zmin, zmax, True):
            corresponding = list(r_serv.smembers('L:'+h))
            out.write("hash:{0}:{1}:{2}\n".format(h, num, corresponding))
|
||||||
def paste_searching(r_serv, filename, pastename, mincard, maxcard):
    """Search similar hashes from a given paste.

    :param r_serv: -- connexion to redis database
    :param filename: -- the pathname of the created report file.
    :param pastename: -- the paste name searched for in redis.
    :param mincard: -- minimum occurrence of a hash to be taken in count.
    :param maxcard: -- maximum occurrence of a hash to be taken in count.

    Writes a synthesis of where (in the other pastes) the hashes of the
    given paste have been found.
    """
    target = set([pastename])
    prev_hash = str()
    similar = set([])

    with open(filename, 'wb') as report:
        report.write("Paste: {0}\nOptions used:\nMincard: {1}\nMaxcard: {2}\n\nContaining Following Hash:\n".format(pastename,mincard,maxcard))

        for h in r_serv.smembers("hash"):
            members = r_serv.smembers(h)

            # Keep hashes that contain the paste and whose cardinality
            # falls inside the requested window.
            if members.intersection(target) and mincard <= r_serv.scard(h) <= maxcard:
                report.write(h + '\n')
                similar = similar.union(members.union(r_serv.smembers(prev_hash)))
                prev_hash = h

        report.write("\nSimilar Files:\n")

        for n, s in enumerate(similar):
            report.write(str(n) + ': ' + s + '\n')
|
||||||
def paste_searching2(r_serv, filename, pastename, mincard, maxcard):
    """Search similar hashes from a given paste.

    (Works on the sorted-set flavour of the redis datastructure.)

    :param r_serv: -- connexion to redis database
    :param filename: -- the pathname of the created report file.
    :param pastename: -- the paste name searched for in redis.
    :param mincard: -- minimum occurrence of a hash to be taken in count.
    :param maxcard: -- maximum occurrence of a hash to be taken in count.

    Writes a synthesis of where (in the other pastes) the hashes of the
    given paste have been found.
    """
    target = set([pastename])
    prev_hash = str()
    similar = set([])

    with open(filename, 'wb') as report:
        report.write("Paste: {0}\nOptions used:\nMincard: {1}\nMaxcard: {2}\n\n###Containing Following Hash:### ###Occur### ###### Corresponding Line ######\n".format(pastename,mincard,maxcard))

        for h in r_serv.zrange("hash", 0, -1):
            members = r_serv.smembers(h)

            if members.intersection(target) and mincard <= r_serv.scard(h) <= maxcard:
                # hash -- occurrence score -- corresponding clear lines.
                report.write(h + ' -- ' + str(r_serv.zscore("hash",h)) + ' -- ' + str(list(r_serv.smembers('L:' + h))) + '\n')
                similar = similar.union(members.union(r_serv.smembers(prev_hash)))
                prev_hash = h

        report.write("\nSimilar Files:\n")

        for n, s in enumerate(similar):
            report.write(str(n) + ': ' + s + '\n')
|
|
@ -19,316 +19,30 @@ from dateutil.rrule import rrule, DAILY
|
||||||
|
|
||||||
from packages import *
|
from packages import *
|
||||||
|
|
||||||
def redis_words_ranking(pipe, r_serv, nb, minlength, maxlength):
    """Pop up to *nb* pastes from the queue and rank their words.

    :param pipe: -- Redis pipe.
    :param r_serv: -- Connexion to redis.
    :param nb: -- (int) Number of pastes proceeded by function
    :param minlength: -- (int) passed to redis_zincr_words
    :param maxlength: -- (int) passed to redis_zincr_words

    Each popped paste is registered in the pid list, processed, then
    pushed on the "processed" queue. On interrupt the pid list is
    flushed before leaving.
    """
    try:
        for _ in xrange(0, nb):
            path = r_serv.lpop("filelist")

            if path is None:
                # Nothing left to process.
                publisher.debug("Empty list")
                break

            set_listof_pid(r_serv, path, sys.argv[0])
            redis_zincr_words(pipe, path, minlength, maxlength)
            update_listof_pid(r_serv)

            r_serv.lpush("processed", path)
            publisher.debug(path)

    except (KeyboardInterrupt, SystemExit):
        # Make sure stale pid entries do not survive an interrupt.
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
|
clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
|
||||||
|
"""It filters out non-printable characters from the string it receives."""
|
||||||
|
|
||||||
|
|
||||||
def redis_zincr_words(pipe, filename, minlength, maxlength):
    """Count the words of a paste into the per-day sorted set.

    :param pipe: -- Redis pipe
    :param filename: -- The absolute path of the file.gz to process.
    :param minlength: -- (int) Minimum word length inserted
    :param maxlength: -- (int) Maximum word length inserted

    Tokenises the whole paste on punctuation/whitespace and, under the
    YYYYMMDD key taken from the filename, increments one score per word
    whose length lies in [minlength, maxlength] (the cardinality).
    Words at or above maxlength are logged.
    """
    tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps = True, discard_empty = True)

    with gzip.open(filename, 'rb') as F:
        blob = TextBlob(clean(F.read()), tokenizer = tokenizer)

        # The day key is embedded in the file path ("YYYY/MM/DD").
        day_key = filename[-22:-12].replace('/', '')

        for word in blob.tokens:
            if minlength <= len(word) <= maxlength:
                pipe.zincrby(day_key, word, 1)

            if len(word) >= maxlength:
                publisher.info("word bigger than {0} detected at {1}".format(maxlength, filename))
                publisher.info(word)

    pipe.execute()
|
||||||
|
|
||||||
def classify_token_paste(r_serv, listname, choicedatastruct, nb, r_set):
    """Tokenise pastes and score them against keyword category lists.

    :param r_serv: -- Redis database connexion
    :param listname: -- (str) path to the file containing the list of
                        paths of category files
    :param choicedatastruct: -- (bool) True: redis key is the paste
                                filename, member is the categ; False:
                                key is the categ, member is the filename
    :param nb: -- (int) Number of pastes proceeded by function
    :param r_set: -- (str) redis list to pop the paste names from

    Splits each paste on special characters (@^\|[{#~}]!:;$^=) and, for
    every token found in a category keyword list, increments the
    matching redis sorted set (layout chosen by choicedatastruct).
    The category lists can contain anything, but are best used as
    "categories" of keywords.
    """
    try:
        for _ in xrange(0, nb):
            filename = r_serv.lpop(r_set)

            if filename is None:
                publisher.debug("Empty list")
                #r_serv.save()
                break

            tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps = True, discard_empty = True)
            set_listof_pid(r_serv, filename, sys.argv[0])

            with open(listname, 'rb') as L:
                # One line per category keyword file.
                for num, fname in enumerate(L):
                    # Collect the keywords of this category (strip '\n').
                    tmp_list = []
                    with open(fname[:-1], 'rb') as LS:
                        for num, kword in enumerate(LS):
                            tmp_list.append(kword[:-1])

                    # Tokenise the paste once per category.
                    with gzip.open(filename, 'rb') as F:
                        blob = TextBlob(clean(F.read()),
                                        tokenizer = tokenizer)

                        for word in blob.tokens.lower():
                            if word in tmp_list:
                                categ = fname.split('/')[-1][:-1]
                                # Two mirrored layouts, see docstring.
                                if choicedatastruct:
                                    r_serv.zincrby(filename, categ, 1)
                                else:
                                    r_serv.zincrby(categ, filename, 1)

            update_listof_pid(r_serv)

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
|
||||||
def dectect_longlines(r_serv, r_key, store = False, maxlength = 500):
    """Store longlines's linenumbers in redis.

    :param r_serv: -- The redis connexion database
    :param r_key: -- (str) The key name in redis
    :param store: -- (bool) Store the line numbers or not.
    :param maxlength: -- The limit between "short lines" and "long lines"

    Pop paste filenames from the redis list ``r_key`` until it is empty.
    Each paste is scanned line by line; as soon as one line is at least
    ``maxlength`` characters long the paste is pushed once onto the
    "longlines" redis list.  When ``store`` is True the line number of
    every long line is additionally saved in a redis set keyed by the
    paste filename.  Returns False once the input list is exhausted.
    """
    try:
        while True:
            filename = r_serv.lpop(r_key)

            if filename is None:
                publisher.debug("Empty list")
                return False

            set_listof_pid(r_serv, filename, sys.argv[0])

            with gzip.open(filename, 'rb') as paste:
                first_hit = True
                for linenum, line in enumerate(paste):
                    if len(line) < maxlength:
                        continue

                    if first_hit:
                        # tag the paste once, on its first long line
                        r_serv.rpush("longlines", filename)
                        first_hit = False

                    if store:
                        r_serv.sadd(filename, linenum)
                    else:
                        publisher.debug("Line numbers of longlines not stored")

            update_listof_pid(r_serv)

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
|
||||||
# NOT USED RIGHT NOW #
def recovering_longlines(r_serv, nb=200):
    """Get longlines with linenumbers.

    :param r_serv: -- The redis connexion database
    :param nb: -- (int) Maximum number of pastes popped from "longlines"

    Pop paste filenames from the "longlines" redis list and re-open each
    paste at the stored line numbers (treatment is still a stub).  On an
    empty list a debug message is logged, redis is saved and the loop stops.

    FIX: the original body iterated ``xrange(0, nb)`` while ``nb`` was
    never defined (NameError on first call); it is now an explicit
    parameter with a default value.
    """
    try:
        for n in xrange(0, nb):
            filename = r_serv.lpop("longlines")

            if filename is None:
                publisher.debug("Empty list")
                r_serv.save()
                break

            # For each stored value (longline's line number) of this paste
            for numline in r_serv.smembers(filename):
                with gzip.open(filename, 'rb') as F:
                    for num, line in enumerate(F):
                        # When the current line matches a stored number.
                        if int(num) == int(numline):
                            pass
                            # TREATMENT

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
|
||||||
def remove_longline_from_categ(r_serv, r_key, delete, store, maxlength):
    """Remove from a set, file with long lines.

    :param r_serv: -- The redis connexion database
    :param r_key: -- (str) The key name in redis
    :param delete: -- (bool) If true, delete the used key from redis.
    :param store: -- (bool) Store the line numbers or not.
    :param maxlength: -- The limit between "short lines" and "long lines"

    """
    publisher.info("Number of file before:{0}".format(r_serv.zcard(r_key)))

    # Build a work list from every member of the zset (1)
    for member in r_serv.zrange(r_key, 0, -1):
        r_serv.rpush(r_key + "_list", member)

    # Detect which of those pastes contain long lines
    dectect_longlines(r_serv, r_key + "_list", store, maxlength)

    # Drain "longlines" and drop each false positive from the r_key zset
    while True:
        fp_filename = r_serv.lpop("longlines")

        if fp_filename is None:
            break

        r_serv.zrem(r_key, fp_filename)
        if delete:
            # if wanted, delete in addition the set with linenumbers
            # (created with store)
            r_serv.delete(fp_filename)

    publisher.info("Longline file removed from {0}, {1} Files remaining".format(r_key, r_serv.zcard(r_key)))
||||||
def detect_longline_from_list(r_serv, nb):
    """Run the longline detection over the "filelist" redis list.

    :param r_serv: -- The redis connexion database
    :param nb: -- (int) Maximum number of detection rounds

    NOTE(review): dectect_longlines drains "filelist" completely before
    returning False, so in practice this loop ends after the first
    round — confirm before relying on ``nb``.
    """
    try:
        for _ in xrange(0, nb):
            if not dectect_longlines(r_serv, "filelist", True):
                break

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
|
@ -369,182 +83,6 @@ def create_dirfile(r_serv, directory, overwrite):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def redis_interbargraph_set(r_serv, year, month, overwrite):
    """Create a Redis sorted set.

    :param r_serv: -- connexion to redis database
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param overwrite: -- (bool) trigger the overwrite mode

    This function create inside redis the intersection of all days in
    a month two by two.
    Example:
    For a month of 31days it will create 30 sorted set between day and
    day+1 until the last day.
    The overwrite mode delete the intersets and re-create them.

    FIX: the two branches duplicated the whole key-building logic; the
    "YYYYMMDD" key names are now computed once per day, with behavior
    unchanged.
    """
    first = date(year, month, 1)
    last = date(year, month, cal.monthrange(year, month)[1])

    if overwrite:
        r_serv.delete("InterSet")

    for dt in rrule(DAILY, dtstart=first, until=last - timedelta(1)):
        day = dt.strftime("%Y%m%d")
        nextday = (dt + timedelta(1)).strftime("%Y%m%d")
        # the interset key is the concatenation of both day stamps
        interkey = day + nextday

        if overwrite:
            r_serv.delete(interkey)
            r_serv.zinterstore(interkey, {day: 1, nextday: -1})
            # old redis-py API: zadd(key, score, member)
            r_serv.zadd("InterSet", 1, interkey)
        elif r_serv.zcard(interkey) == 0:
            r_serv.zinterstore(interkey, {day: 1, nextday: -1})
            r_serv.zadd("InterSet", 1, interkey)
            publisher.info(interkey + " Intersection Created")
        else:
            publisher.warning("Data already exist, operation aborted.")
||||||
def word_bar_graph(r_serv, year, month, filename):
    """Create an histogram.

    :param r_serv: -- connexion to redis database
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param filename: -- The absolute path where to save the figure.png

    This function use matplotlib to create an histogram.
    The redis database need obviously to be populated first
    with functions: redis_words_ranking and redis_interbargraph_set.

    """
    rcParams['figure.figsize'] = 15, 10

    words_per_day = []
    day_labels = []
    intersections = [0]

    first_day = date(year, month, 1)
    last_day = date(year, month, cal.monthrange(year, month)[1])

    # one bar per day: cardinality of that day's word zset
    for dt in rrule(DAILY, dtstart=first_day, until=last_day):
        words_per_day.append(r_serv.zcard(dt.strftime("%Y%m%d")))
        day_labels.append(dt.strftime("%d"))

    # cardinality of every day/day+1 interset
    for interkey in r_serv.zrange("InterSet", 0, 31):
        intersections.append(r_serv.zcard(interkey))

    n_groups = len(words_per_day)
    card_words = tuple(words_per_day)
    card_interword = tuple(intersections)

    index = np.arange(n_groups)
    bar_width = 0.5
    opacity = 0.6

    words = plt.bar(index, card_words, bar_width,
                    alpha=opacity,
                    color='g',
                    label='Words/day')

    lwords = plt.bar(index - 0.5, card_interword, bar_width,
                     alpha=opacity,
                     color='r',
                     label='Intersection')

    plt.plot(tuple(intersections), 'b--')
    plt.xlabel(str(year)+'/'+str(month)+' Days')
    plt.ylabel('Words')
    plt.title('Words Cardinality & Intersection Histogram')
    plt.xticks(index + bar_width/2, tuple(day_labels))

    plt.legend()
    plt.grid()

    plt.tight_layout()

    plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
                orientation='portrait', papertype=None, format="png",
                transparent=False, bbox_inches=None, pad_inches=0.1,
                frameon=True)

    publisher.info(filename+".png"+" saved!")
|
||||||
def create_data_words_curve(r_serv, r_serv2, year, month, filename):
    """Create a Redis hashes.

    :param r_serv: -- connexion to redis database (read)
    :param r_serv2: -- connexion to redis database (write)
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param filename: -- the path to the file which contain a list of words.

    The hashes of redis is created as follow:

    +------------+------------+-----------+
    | Keys       | Field      | Values    |
    +============+============+===========+
    | word1      | 20131001   | 150       |
    +------------+------------+-----------+
    | ...        | 20131002   | 145       |
    +------------+------------+-----------+
    | word2      | ...        | ...       |
    +------------+------------+-----------+

    The filename need to be a list of words separated by a carriage return
    with an empty line at the end.
    This function create datas which is used by the function
    create_curve_with_word_file which create a csv file.

    FIX: removed the unused ``stop = stopwords.words('english')`` local
    (a needless NLTK corpus load) and the duplicated per-day ``zscore``
    call — behavior is otherwise unchanged.
    """
    first = date(year, month, 1)
    last = date(year, month, cal.monthrange(year, month)[1])

    with open(filename, 'rb') as wordfile:
        for line in wordfile:
            # strip the trailing newline to get the word itself
            word = line[:-1]

            for dt in rrule(DAILY, dtstart=first, until=last):
                day = dt.strftime("%Y%m%d")
                score = r_serv.zscore(day, word)

                if score is not None:
                    # tester si ca existe deja "en option" et ajouter
                    # un WARNING log
                    r_serv2.hmset(word, {day: score})
|
||||||
def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
|
def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
|
||||||
"""Create a csv file used with dygraph.
|
"""Create a csv file used with dygraph.
|
||||||
|
|
||||||
|
|
|
@ -1,56 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_words import *
from packages.imported import *

def main():
    """Parse the CLI arguments and draw the words/intersection histogram.

    FIX: dropped the unused ``p = r.pipeline(False)`` local — the
    pipeline was created but never used.
    """

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information Leak
        framework. It create an histogram which display the occurency
        of the words per day but also the intersection of day and day-1 of these
        occurencies''',
        epilog = '''The Redis database need to be populated by the script
        Wordsranking_Populate.py before using this one.''')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('y',
                        type = int,
                        metavar = "year",
                        help = 'The year processed.',
                        action = 'store')

    parser.add_argument('m',
                        type = int,
                        metavar = "month",
                        help = 'The month processed.',
                        action = 'store')

    parser.add_argument('-f',
                        type = str,
                        metavar = "filename",
                        default = "figure",
                        help = 'The absolute path name of the "figure.png"',
                        action = 'store')

    args = parser.parse_args()

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    word_bar_graph(r, args.y, args.m, args.f)

if __name__ == "__main__":
    main()
|
|
@ -1,64 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_refine import *
from packages.imported import *

def main():
    """Parse the CLI arguments and draw the per-category histogram.

    FIX: dropped the unused ``p = r.pipeline(False)`` local — the
    pipeline was created but never used.
    """

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information Leak
        framework. It create an histogram which display the occurency
        of the word category per days.''',
        epilog = '''The Redis database need to be populated by the script
        Classify_Paste_Token.py before.
        It's also usefull to launch Remove_longline_fp.py and Refine_with_regex.py
        to create a more accurate histogram.
        example: ./Bargraph_categ_by_day.py 2013 12 mails_categ''')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('-f',
                        type = str,
                        metavar = "filename",
                        default = "figure",
                        help = 'The absolute path name of the "figure.png"',
                        action = 'store')

    parser.add_argument('y',
                        type = int,
                        metavar = "year",
                        help = 'The year processed',
                        action = 'store')

    parser.add_argument('m',
                        type = int,
                        metavar = "month",
                        help = 'The month processed',
                        action = 'store')

    parser.add_argument('key',
                        type = str,
                        help ='name of the key to process in redis (the word_categ concerned)',
                        action = 'store')

    args = parser.parse_args()

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    graph_categ_by_day(r, args.f, args.y, args.m, args.key)

if __name__ == "__main__":
    main()
|
|
@ -1,61 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_words import *
from packages.imported import *
from pubsublogger import publisher

def main():
    """Parse the CLI arguments and classify pastes by token category.

    FIX: dropped the unused ``p = r.pipeline(False)`` local — the
    pipeline was created but never used.
    """

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information Leak
        framework. It create sets in redis as much as category
        defined in the file given by the argument -l ''',
        epilog = '''Example : seq 5000 | parallel -n0 -j 10
        ./classify_Paste_Token.py -nbp 200''')

    parser.add_argument('-l',
                        type = str,
                        default = "../files/list_categ_files",
                        help = 'Path to the list_categ_files (../files/list_categ_files)',
                        action = 'store')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('-s',
                        help = 'Datastruct type, swapping between keys & members',
                        action = 'store_true')

    parser.add_argument('-nbp',
                        type = int,
                        default = 200,
                        help = 'Nbpaste',
                        action = 'store')

    parser.add_argument('-set',
                        type = str,
                        default = 'filelist',
                        help = 'The name of the list in redis which contain the filename to tokenise',
                        action = 'store')

    args = parser.parse_args()

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    publisher.channel = "youpi"

    classify_token_paste(r, args.l, args.s, args.nbp, args.set)

if __name__ == "__main__":
    main()
|
|
@ -1,46 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_words import *
from packages.imported import *
from pubsublogger import publisher

def main():
    """Parse the CLI arguments and display the monitored pid list.

    FIX: dropped the unused ``p = r.pipeline(False)`` local — the
    pipeline was created but never used.
    """

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information Leak
        framework. It's here to monitor some script which take time
        and lauched in parallel, You can display which process is running on which
        paste and how much time it spent processing it''',
        epilog = 'example : ./Display_pid -p pid -db 1 -d remain')

    parser.add_argument('-d',
                        type = str,
                        default = 'all',
                        choices=['paste', 'up', 'start', 'kb', 'all', 'pid', 'prg', 'remain', 'processed'],
                        help = 'Which info to display ?',
                        action = 'store')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    args = parser.parse_args()

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    publisher.channel = "youpi"

    display_listof_pid(r, args.d)

if __name__ == "__main__":
    main()
|
|
@ -1,65 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_gephi import *
from packages.imported import *

def main():
    """Parse the CLI arguments and build the Gephi similarity graph."""

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information
        Leak framework. It create a gephi graph to have a global
        view of the pastes but also which one are similar.''',
        epilog = '''The Redis database need to be populated by the script
        Populate.py before using this one.''')

    parser.add_argument('-t',
                        type = int,
                        default = 0,
                        help = 'Type of the Redis population (Same arg than in Populate.py)',
                        choices=[0, 2],
                        action = 'store')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('-min',
                        type = int,
                        default = 3,
                        help = 'minimum linked nodes (default 3)',
                        action = 'store')

    parser.add_argument('-max',
                        type = int,
                        default = 50,
                        help = 'maximum linked nodes created (execute top.py before for more info)',
                        action = 'store')

    parser.add_argument('-p',
                        type = str,
                        default = '../graph/out',
                        metavar = 'path',
                        help = "pathname of the graph file created. ex: /home/graph",
                        action = 'store')

    args = parser.parse_args()

    # graph creation goes through the local unix socket
    srv = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db,
        unix_socket_path='/tmp/redis.sock')

    Gephi_Graph(srv, args.p+".gexf", args.min, args.max, args.t)

    cprint("GRAPH CREATED AT:{0}.gexf".format(args.p),"green")

if __name__ == "__main__":
    main()

#OK
|
|
@ -1,52 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_words import *
from packages.imported import *

def main():
    """Parse the CLI arguments and create the day-by-day intersets.

    FIX: dropped the unused ``p = r.pipeline(False)`` local — the
    pipeline was created but never used.
    """

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information
        Leak framework. It create in redis the intersection
        between all the days two by two of the date given in argument.''',
        epilog = '''The Redis database need to be populated by the script
        Wordsranking_Populate.py before using this one.''')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('y',
                        type = int,
                        metavar = "year",
                        help = 'The year',
                        action = 'store')

    parser.add_argument('m',
                        type = int,
                        metavar = "month",
                        help = 'The month',
                        action = 'store')

    parser.add_argument('-ow',
                        help = 'trigger the overwritting mode',
                        action = 'store_true')

    args = parser.parse_args()

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    redis_interbargraph_set(r, args.y, args.m, args.ow)

if __name__ == "__main__":
    main()
|
|
@ -1,75 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_redis_insert import *
from packages.imported import *

def main():
    """Parse the CLI arguments and populate redis with hashed paste lines."""

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information
        Leak framework. Is Populate the Redis database with
        the pastes names and theirs hashed line''',
        epilog = '''This script need to be run first in order to use the others:
        Graph.py, Search.py, Top.py ...''')

    parser.add_argument('input',
                        type = str,
                        metavar = 'pathfolder',
                        help = 'Input folder',
                        action = 'store')

    parser.add_argument('-t',
                        type = int,
                        default = 0,
                        help = 'type of population wanted 0 = set 1 = zset 2 = mix',
                        choices=[0, 1, 2],
                        action = 'store')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('-H',
                        type = str,
                        default = 'md5',
                        metavar='hash',
                        help = 'The hash method (default md5)',
                        choices=["md5", "sha1", "crc", "murmur"],
                        action = 'store')

    parser.add_argument('-jmp',
                        type = int,
                        default = 10,
                        metavar = 'jump',
                        help = '''Jumping line factor. 1 = All the line are taken. X = jump X line
                        (default 10)''',
                        action = 'store')

    parser.add_argument('-ml',
                        type = int,
                        default = 1,
                        metavar = 'minlnline',
                        help = '''Length line factor. 1 = All the line are taken.
                        X = each line >= X char (default 1)''',
                        action = 'store')

    args = parser.parse_args()

    srv = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    # the population runs through a pipeline for batched writes
    pipe = srv.pipeline()

    redis_populate(pipe, listdirectory(args.input), args.ml, args.H, args.jmp, args.t)

if __name__ == "__main__":
    main()

#OK
|
|
@ -1,78 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_refine import *
from packages.imported import *
from pubsublogger import publisher

def main():
    """Parse the CLI arguments and refine a categ set with a regex.

    FIX: the ``-regex`` option accepts the choice ``'url'`` but the
    original dispatch had no matching branch, so selecting it crashed
    with a NameError on the undefined ``regex`` — a url pattern branch
    is now provided.  Also dropped the unused ``p = r.pipeline(False)``
    local.
    """

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information
        Leak framework. Is refining a redis set by
        re analysing set with regex and changing the score by the number of
        regex matching''',
        epilog = '''example of use: ./Refine_with_regex.py 2013 12 -regex mail
        -key mails_categ''')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('-nbm',
                        type = int,
                        default = 1,
                        help = 'Minimum matching regex occurence per file to keep in redis (1)',
                        action = 'store')

    parser.add_argument('-regex',
                        type = str,
                        default = 'mail',
                        choices=['mail', 'card', 'url', 'bitcoin'],
                        help = 'Which regex wanted to be use to match',
                        action = 'store')

    parser.add_argument('-key',
                        type = str,
                        default = "mails_categ",
                        help = 'Name of the key to process in redis (same name than the wordlist concerned)',
                        action = 'store')

    parser.add_argument('y',
                        type = int,
                        metavar = "year",
                        help = 'The year processed',
                        action = 'store')

    parser.add_argument('m',
                        type = int,
                        metavar = "month",
                        help = 'The month processed',
                        action = 'store')

    args = parser.parse_args()

    if args.regex == 'mail':
        regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
    elif args.regex == 'card':
        regex = "4[0-9]{12}(?:[0-9]{3})?"
    elif args.regex == 'url':
        # FIX: this valid choice previously left `regex` unassigned
        regex = "(?:http|https|ftp)://[\w.-]+(?:/[\w./?%&=+-]*)?"
    elif args.regex == 'bitcoin':
        regex = "[13][1-9A-HJ-NP-Za-km-z]{26,33}"

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    publisher.channel = "youpi"

    refining_regex_dataset(r, args.key, regex, args.nbm, args.y, args.m)

if __name__ == "__main__":
    main()
|
|
@ -1,44 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_redis_insert import *
from packages.imported import *

def main():
    """Parse the CLI arguments and remove exact duplicate pastes.

    FIX: dropped the unused ``p = r.pipeline(False)`` local — the
    pipeline was created but never used.
    """

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information
        Leak framework. It Add to a temporary list the hash
        of wholes files and compare the new hash to the element of this
        list. If the hash is already inside, the file is deleted
        otherwise the hash is added in the list.''',
        epilog = '''This script need Redis to be populated before by
        ./Dir.py''')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('-nbp',
                        type = int,
                        default = 200,
                        help = 'nbpaste',
                        action = 'store')

    args = parser.parse_args()

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    remove_pure_doppelganger(r, args.nbp)

if __name__ == "__main__":
    main()
|
|
@ -1,57 +0,0 @@
|
||||||
#!/usr/bin/python2.7
# -*-coding:UTF-8 -*

from packages.lib_words import *
from packages.imported import *

def main():
    """Parse the CLI arguments and run the longline detection.

    FIX: dropped the unused ``p = r.pipeline(False)`` local — the
    pipeline was created but never used.
    """

    parser = argparse.ArgumentParser(
        description = '''This script is a part of the Analysis Information Leak
        framework. It removes the line which are in redis under
        the "key" name argument''',
        epilog = '''This script is usually usefull launched after using
        ./Classify_Paste_Token.py example: ./Remove_longline_fp.py mails_categ''')

    parser.add_argument('-db',
                        type = int,
                        default = 0,
                        help = 'The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4],
                        action = 'store')

    parser.add_argument('key',
                        type = str,
                        help = 'Name of the key to process in redis ("")',
                        action = 'store')

    parser.add_argument('-d',
                        help = 'Delete the set of longline created?',
                        action = 'store_true')

    parser.add_argument('-s',
                        help = 'Store the longline numbers inside a set?',
                        action = 'store_true')

    parser.add_argument('-max',
                        type = int,
                        default = 500,
                        help = 'The limit between "short lines" and "long lines" (500)',
                        action = 'store')

    args = parser.parse_args()

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db)

    #remove_longline_from_categ(r, args.key, args.d, args.s, args.max)
    # NOTE(review): args.max (a line-length limit) is passed as the
    # round-count parameter of detect_longline_from_list — confirm
    # this is intended before changing it.
    detect_longline_from_list(r, args.max)

if __name__ == "__main__":
    main()
|
|
@ -1,72 +0,0 @@
|
||||||
#!/usr/bin/python2.7
|
|
||||||
# -*-coding:UTF-8 -*
|
|
||||||
|
|
||||||
from packages.lib_search import *
|
|
||||||
from packages.imported import *
|
|
||||||
|
|
||||||
def main():
    """Search a paste by name and write a report of its linked hashes.

    Dispatches on the Redis population type (-t, same meaning as in
    Populate.py): 0 -> paste_searching() over plain sets, 2 ->
    paste_searching2() over sorted sets.  Both write the same report file
    at <path><pastename>.txt.
    """
    parser = argparse.ArgumentParser(
        description='Analysis Information Leak framework',
        epilog='MSc Student Internship')

    parser.add_argument('-db',
                        default=0,
                        type=int,
                        help='The name of the Redis DB',
                        choices=[0, 1, 2, 3, 4],
                        action='store')

    parser.add_argument('name',
                        type=str,
                        metavar='pastename',
                        help='The name of the paste',
                        action='store')

    parser.add_argument('-min',
                        type=int,
                        default=3,
                        help='minimum linked hashs (default 3)',
                        action='store')

    parser.add_argument('-max',
                        type=int,
                        default=50,
                        help='maximum linked hash (execute top.py to be more aware)',
                        action='store')

    parser.add_argument('-p',
                        type=str,
                        default='../graph/Search_',
                        metavar='path',
                        help="pathname of the file created.",
                        action='store')

    parser.add_argument('-t',
                        type=int,
                        default=0,
                        help='Type of the Redis population (Same arg than in Populate.py)',
                        choices=[0, 2],
                        action='store')

    args = parser.parse_args()

    r = redis.StrictRedis(
        host='localhost',
        port=6379,
        db=args.db,
        unix_socket_path='/tmp/redis.sock')

    # argparse restricts -t to {0, 2}, so this dispatch is exhaustive.
    # Fix: hoisted the duplicated cprint() and the output-path expression
    # out of the two branches; dropped the dangling `pass`.
    search = paste_searching2 if args.t == 2 else paste_searching
    search(r, args.p + args.name + ".txt", args.name, args.min, args.max)
    cprint("GRAPH CREATED AT:{0}{1}.txt".format(args.p, args.name), "green")


if __name__ == "__main__":
    main()
|
|
|
@ -1,58 +0,0 @@
|
||||||
#!/usr/bin/python2.7
|
|
||||||
# -*-coding:UTF-8 -*
|
|
||||||
|
|
||||||
from packages.lib_search import Create_Common_Hash_File
|
|
||||||
from packages.imported import *
|
|
||||||
|
|
||||||
def main():
    """Write the top common hashes held in Redis to a text file.

    Builds a toplist starting at offset -off, -top entries long, and writes
    it to <path><top>.top via Create_Common_Hash_File().  Redis must have
    been populated by Populate.py beforehand.
    """
    parser = argparse.ArgumentParser(
        description='''This script is a part of the Analysis Information Leak
        framework. It create a text file with the top common hash
        which are in the redis database''',
        epilog='''The Redis database need to be populated by the script
        Populate.py before using this one.''')

    parser.add_argument('-db',
                        default=0,
                        type=int,
                        help='The name of the Redis DB',
                        choices=[0, 1, 2, 3, 4],
                        action='store')

    parser.add_argument('-off',
                        default=1,
                        type=int,
                        metavar='offset',
                        help='Starting point of the toplist',
                        action='store')

    parser.add_argument('-top',
                        default=100,
                        type=int,
                        metavar='100',
                        help='How many occurence? top 10-50-100 ?',
                        action='store')

    parser.add_argument('-p',
                        type=str,
                        default='../graph/top',
                        metavar='path',
                        help="pathname of the file created ex: /home/top",
                        action='store')

    args = parser.parse_args()

    r = redis.StrictRedis(host='localhost', port=6379, db=args.db)

    # BUG FIX: the original called the undefined name
    # create_common_cash_file ("cash"); the helper actually imported at the
    # top of the file is Create_Common_Hash_File.
    Create_Common_Hash_File(r, args.off, args.top, args.p + str(args.top) + ".top")
    cprint("LIST CREATED", "green")


if __name__ == "__main__":
    main()
|
|
|
@ -1,64 +0,0 @@
|
||||||
#!/usr/bin/python2.7
|
|
||||||
# -*-coding:UTF-8 -*
|
|
||||||
|
|
||||||
from packages.lib_words import *
|
|
||||||
from packages.imported import *
|
|
||||||
|
|
||||||
def main():
    """Build per-word curve data between two Redis databases.

    Reads the word list from file `f`, computes curve data for the given
    year/month from DB -db, and stores the result into DB -db1 via
    create_data_words_curve().
    """
    parser = argparse.ArgumentParser(
        description='Analysis Information Leak framework',
        epilog='Thats drawing')

    parser.add_argument('-db',
                        type=int,
                        default=0,
                        help='The name of the Redis DB To get the info (0)',
                        choices=[0, 1, 2, 3, 4],
                        action='store')

    parser.add_argument('-db1',
                        type=int,
                        default=1,
                        help='The name of the Redis DB To store (1)',
                        choices=[0, 1, 2, 3, 4],
                        action='store')

    parser.add_argument('f',
                        type=str,
                        metavar="file",
                        help='Words filename',
                        action='store')

    parser.add_argument('y',
                        type=int,
                        metavar="year",
                        help='The year',
                        action='store')

    parser.add_argument('m',
                        type=int,
                        metavar="month",
                        help='The month',
                        action='store')

    args = parser.parse_args()

    # Source DB (reads) and destination DB (writes) on the same local server.
    r = redis.StrictRedis(host='localhost', port=6379, db=args.db)
    r2 = redis.StrictRedis(host='localhost', port=6379, db=args.db1)

    # Cleanup: dropped the unused `p = r.pipeline(False)` from the original.
    create_data_words_curve(r, r2, args.y, args.m, args.f)


if __name__ == "__main__":
    main()
|
|
|
@ -1,57 +0,0 @@
|
||||||
#!/usr/bin/python2.7
|
|
||||||
# -*-coding:UTF-8 -*
|
|
||||||
|
|
||||||
from packages.lib_words import *
|
|
||||||
from packages.imported import *
|
|
||||||
|
|
||||||
def main():
    """CLI entry point: dump words-trending data for a month into a CSV file.

    Reads the word list from positional argument `f` and, for the given
    year/month, writes the trending data to the -cvs file via
    create_curve_with_word_file().
    """
    argparser = argparse.ArgumentParser(
        description='Analysis Information Leak framework',
        epilog='Thats drawing')

    argparser.add_argument('-db',
                           type=int,
                           default=0,
                           help='The name of the Redis DB (default 0)',
                           choices=[0, 1, 2, 3, 4],
                           action='store')

    argparser.add_argument('-cvs',
                           type=str,
                           metavar="filename",
                           default="wordstrendingdata",
                           help='The name of the cvs file wanted to be created',
                           action='store')

    argparser.add_argument('f',
                           type=str,
                           help='The file with the list of words',
                           action='store')

    argparser.add_argument('y',
                           type=int,
                           metavar="year",
                           help='The year',
                           action='store')

    argparser.add_argument('m',
                           type=int,
                           metavar="month",
                           help='The month',
                           action='store')

    opts = argparser.parse_args()

    db_conn = redis.StrictRedis(host='localhost', port=6379, db=opts.db)
    # Pipeline created for parity with the original code path (unused here).
    db_conn.pipeline(False)

    create_curve_with_word_file(db_conn, opts.cvs, opts.f, opts.y, opts.m)


if __name__ == "__main__":
    main()
|
|
|
@ -1,54 +0,0 @@
|
||||||
#!/usr/bin/python2.7
|
|
||||||
# -*-coding:UTF-8 -*
|
|
||||||
|
|
||||||
from packages.lib_words import *
|
|
||||||
from packages.imported import *
|
|
||||||
from pubsublogger import publisher
|
|
||||||
|
|
||||||
def main():
    """CLI entry point: populate Redis with word-ranking data from pastes.

    Processes -nbp pastes, inserting words whose length is between -min and
    -max through a Redis pipeline via redis_words_ranking().
    """
    cli = argparse.ArgumentParser(
        description='Analysis Information Leak framework',
        epilog='example : seq 2 | parallel ./Wordsranking_Populate.py -nbp 20')

    cli.add_argument('-nbp',
                     type=int,
                     default=200,
                     help='nbpaste',
                     action='store')

    cli.add_argument('-db',
                     type=int,
                     default=0,
                     help='The name of the Redis DB (default 0)',
                     choices=[0, 1, 2, 3, 4],
                     action='store')

    cli.add_argument('-min',
                     type=int,
                     default=4,
                     help='Minimum length of the inserted words (default 4)',
                     action='store')

    cli.add_argument('-max',
                     type=int,
                     default=200,
                     help='Maximum length of the inserted words (default 200)',
                     action='store')

    options = cli.parse_args()

    server = redis.StrictRedis(host='localhost', port=6379, db=options.db)
    # Non-transactional pipeline: commands are batched, not wrapped in MULTI.
    pipe = server.pipeline(False)

    # Route log messages to the "youpi" pubsub channel.
    publisher.channel = "youpi"

    redis_words_ranking(pipe, server, options.nbp, options.min, options.max)


if __name__ == "__main__":
    main()
|
|
Loading…
Reference in New Issue