mirror of https://github.com/CIRCL/AIL-framework
Cleanup of main Paste module
parent 5b17d416c8
commit ece3bc173e
@@ -16,7 +16,17 @@ Conditions to fulfill to be able to use this class correctly:

"""

import os, magic, gzip, langid, pprint, redis, operator, string, re, json, ConfigParser
import os
import magic
import gzip
import pprint
import redis
import operator
import string
import re
import json
import ConfigParser
import cStringIO
from Date import Date
from Hash import Hash

@@ -25,11 +35,10 @@ from langid.langid import LanguageIdentifier, model
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob

from lib_refine import *

clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
"""It filters out non-printable characters from the string it receives."""


class Paste(object):
    """
    This class represents a Paste as an object.
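A Paste is built straight from its on-disk location; a small hedged usage sketch, assuming the constructor takes the paste path as its only argument and that the path follows the source/year/month/day layout handled in the constructor below (the path and variable name are illustrative):

PST = Paste('/home/user/PASTES/pastebin.com/2014/01/01/abcdefgh.gz')  # hypothetical path
print('source=%s size=%s KB mime=%s' % (PST.p_source, PST.p_size, PST.p_mime))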
@@ -50,38 +59,29 @@ class Paste(object):
        configfile = './packages/config.cfg'
        cfg = ConfigParser.ConfigParser()
        cfg.read(configfile)

        self.p_path = p_path

        self.p_name = self.p_path.split('/')[-1]

        self.p_size = round(os.path.getsize(self.p_path)/1024.0,2)

        self.cache = redis.StrictRedis(
            host=cfg.get("Redis_Queues", "host"),
            port=cfg.getint("Redis_Queues", "port"),
            db=cfg.getint("Redis_Queues", "db"))

        self.p_path = p_path
        self.p_name = self.p_path.split('/')[-1]
        self.p_size = round(os.path.getsize(self.p_path)/1024.0, 2)
        self.p_mime = magic.from_buffer(self.get_p_content(), mime=True)

        self.p_encoding = None

        # Assuming that the paste will always be in a day folder which is itself
        # in a month folder which is itself in a year folder.
        # /year/month/day/paste.gz
        var = self.p_path.split('/')
        self.p_date = Date(var[-4], var[-3], var[-2])

        self.p_hash_kind = None
        self.p_hash = None

        self.p_langage = None

        self.p_nb_lines = None
        self.p_max_length_line = None

        self.p_source = var[-5]

        self.p_encoding = None
        self.p_hash_kind = None
        self.p_hash = None
        self.p_langage = None
        self.p_nb_lines = None
        self.p_max_length_line = None

    def get_p_content(self):
        """
@@ -92,16 +92,18 @@ class Paste(object):
        PST.get_p_content()

        """
        r_serv = self.cache

        paste = r_serv.get(self.p_path)
        paste = self.cache.get(self.p_path)
        if paste is None:
            with gzip.open(self.p_path, 'rb') as F:
                paste = F.read()
            r_serv.set(self.p_path, paste)
            r_serv.expire(self.p_path, 300)
            with gzip.open(self.p_path, 'rb') as f:
                paste = f.read()
            self.cache.set(self.p_path, paste)
            self.cache.expire(self.p_path, 300)
        return paste

    def get_p_content_as_file(self):
        return cStringIO.StringIO(self.get_p_content())
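The rewritten get_p_content() behaves as a read-through cache: look the path up in Redis, fall back to decompressing the gzip file on a miss, then store the result with a short expiry so repeated accesses stay cheap. A minimal standalone sketch of the same pattern, assuming a local Redis instance (host, port and the example path are illustrative, not taken from the commit):

import gzip
import redis

cache = redis.StrictRedis(host='localhost', port=6379, db=0)  # assumed local instance

def get_content(path, ttl=300):
    # Try the cache first; decompress the file only on a cache miss.
    content = cache.get(path)
    if content is None:
        with gzip.open(path, 'rb') as f:
            content = f.read()
        cache.set(path, content)
        cache.expire(path, ttl)  # keep the decoded paste around briefly
    return content

get_p_content_as_file() then simply wraps the cached string in cStringIO so callers can iterate over the paste like a file without touching the gzip archive again.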

    def get_lines_info(self):
        """
        Returning and setting the number of lines and the maximum length of the
@@ -112,15 +114,17 @@ class Paste(object):
        :Example: PST.get_lines_info()

        """
        if self.p_nb_lines is None or self.p_max_length_line is None:
            max_length_line = 0
            with gzip.open(self.p_path, 'rb') as F:
                for nb_line in enumerate(F):
                    if len(nb_line[1]) >= max_length_line:
                        max_length_line = len(nb_line[1])

            self.p_nb_lines = nb_line[0]
            f = self.get_p_content_as_file()
            for line_id, line in enumerate(f):
                length = len(line)
                if length >= max_length_line:
                    max_length_line = length
            f.close()
            self.p_nb_lines = line_id
            self.p_max_length_line = max_length_line
            return (nb_line[0], max_length_line)
        return (self.p_nb_lines, self.p_max_length_line)
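Continuing with the PST instance sketched earlier, the method memoises its result on the object and returns a (number_of_lines, longest_line_length) tuple; a hedged usage sketch:

nb_lines, max_len = PST.get_lines_info()
print('%d lines, longest line is %d characters' % (nb_lines, max_len))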

    def _get_p_encoding(self):
        """
@@ -134,7 +138,6 @@ class Paste(object):
        except magic.MagicException:
            pass

    def _set_p_hash_kind(self, hashkind):
        """
        Setting the hash (as an object) used for future operations on it.
@@ -173,9 +176,7 @@ class Paste(object):
        ..seealso: git@github.com:saffsd/langid.py.git

        """

        identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

        return identifier.classify(self.get_p_content())
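langid's classify() returns a (language code, probability) pair, with probabilities normalised because the identifier is built with norm_probs=True. A hedged standalone sketch of the same call (the sample sentence is illustrative):

from langid.langid import LanguageIdentifier, model

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
lang, prob = identifier.classify(u"This is an English sentence.")
print('%s (%f)' % (lang, prob))  # e.g. en with a probability close to 1.0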

    def _get_p_hash_kind(self):
@@ -210,10 +211,8 @@ class Paste(object):
        """
        S = set([])
        with gzip.open(self.p_path, 'rb') as F:

            for num, line in enumerate(F, start):

        f = self.get_p_content_as_file()
        for num, line in enumerate(f, start):
            if len(line) >= min:
                if jump > 1:
                    if (num % jump) == 1:
@@ -222,7 +221,6 @@ class Paste(object):
                        S.add(self.p_hash_kind.Calculate(line))
        return S
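The per-line hashes gathered into S are what is_duplicate() compares between two pastes. A hedged illustration of turning two such sets into a similarity percentage (a generic set-overlap sketch, not necessarily the exact formula used by the method below):

def hash_set_similarity(set_a, set_b):
    # Share of lines from the smaller set that also appear in the other set.
    if not set_a or not set_b:
        return 0.0
    common = len(set_a & set_b)
    return 100.0 * common / min(len(set_a), len(set_b))

print(hash_set_similarity({'a1', 'b2', 'c3'}, {'a1', 'c3', 'd4'}))  # 66.66...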

    def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
        """
        Returning the percent of similarity with another paste.
@@ -264,7 +262,6 @@ class Paste(object):
        else:
            return False, var

    def save_all_attributes_redis(self, r_serv, key=None):
        """
        Saving all the attributes in a "Redis-like" Database (Redis, LevelDB)
@@ -282,22 +279,24 @@ class Paste(object):

        """
        # LevelDB Compatibility
        r_serv.hset(self.p_path, "p_name", self.p_name)
        r_serv.hset(self.p_path, "p_size", self.p_size)
        r_serv.hset(self.p_path, "p_mime", self.p_mime)
        #r_serv.hset(self.p_path, "p_encoding", self.p_encoding)
        r_serv.hset(self.p_path, "p_date", self._get_p_date())
        r_serv.hset(self.p_path, "p_hash_kind", self._get_p_hash_kind())
        r_serv.hset(self.p_path, "p_hash", self.p_hash)
        #r_serv.hset(self.p_path, "p_langage", self.p_langage)
        #r_serv.hset(self.p_path, "p_nb_lines", self.p_nb_lines)
        #r_serv.hset(self.p_path, "p_max_length_line", self.p_max_length_line)
        #r_serv.hset(self.p_path, "p_categories", self.p_categories)
        r_serv.hset(self.p_path, "p_source", self.p_source)
        if key != None:
            r_serv.sadd(key, self.p_path)
        p = r_serv.pipeline(False)
        p.hset(self.p_path, "p_name", self.p_name)
        p.hset(self.p_path, "p_size", self.p_size)
        p.hset(self.p_path, "p_mime", self.p_mime)
        # p.hset(self.p_path, "p_encoding", self.p_encoding)
        p.hset(self.p_path, "p_date", self._get_p_date())
        p.hset(self.p_path, "p_hash_kind", self._get_p_hash_kind())
        p.hset(self.p_path, "p_hash", self.p_hash)
        # p.hset(self.p_path, "p_langage", self.p_langage)
        # p.hset(self.p_path, "p_nb_lines", self.p_nb_lines)
        # p.hset(self.p_path, "p_max_length_line", self.p_max_length_line)
        # p.hset(self.p_path, "p_categories", self.p_categories)
        p.hset(self.p_path, "p_source", self.p_source)
        if key is not None:
            p.sadd(key, self.p_path)
        else:
            pass
        p.execute()
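Replacing the individual HSET calls with a pipeline queues all the writes client-side and sends them to Redis in a single round trip, which is the point of this part of the cleanup. A minimal hedged sketch of the same idiom, assuming a local Redis instance (connection details, key and field values are illustrative):

import redis

r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)

p = r_serv.pipeline(False)  # transaction=False: just batch the commands
p.hset('some/paste/path', 'p_name', 'paste.gz')
p.hset('some/paste/path', 'p_size', 3.14)
p.sadd('pastes_seen', 'some/paste/path')
p.execute()  # one network round trip for every queued command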

    def save_attribute_redis(self, r_serv, attr_name, value):
        """
@@ -311,7 +310,6 @@ class Paste(object):
    def _get_from_redis(self, r_serv):
        return r_serv.hgetall(self.p_hash)

    def _get_top_words(self, sort=False):
        """
        Tokenising method: Returning a sorted list or a set of paste's words
@@ -325,20 +323,16 @@ class Paste(object):
        """
        words = {}
        tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                    gaps = True,
                                    discard_empty = True)
                                    gaps=True, discard_empty=True)

        blob = TextBlob(clean(self.get_p_content()),
                        tokenizer = tokenizer)
        blob = TextBlob(clean(self.get_p_content()), tokenizer=tokenizer)

        for word in blob.tokens:
            if word in words.keys():
                num = words[word]
            else:
                num = 0

            words[word] = num + 1

        if sort:
            var = sorted(words.iteritems(), key=operator.itemgetter(1), reverse=True)
        else:
@@ -346,7 +340,6 @@ class Paste(object):

        return var
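With sort=True the method hands back (word, count) pairs in descending frequency order; a hedged usage sketch reusing the PST instance from the earlier example:

for word, count in PST._get_top_words(sort=True)[:10]:
    print('%s: %d' % (word, count))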

    def _get_word(self, word):
        """
        Returning a specific word and its occurrence if present in the paste
@@ -358,7 +351,6 @@ class Paste(object):
        """
        return [item for item in self._get_top_words() if item[0] == word]

    def get_regex(self, regex):
        """
        Returning matches with the regex given as an argument.