From ece3bc173e7c8776992de2e830c440f4cc8fef97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Wed, 13 Aug 2014 11:56:22 +0200 Subject: [PATCH] Cleanup of main Paste module --- bin/packages/Paste.py | 162 ++++++++++++++++++++---------------------- 1 file changed, 77 insertions(+), 85 deletions(-) diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index 4e5d809c..f8c6dffa 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -16,7 +16,17 @@ Conditions to fulfill to be able to use this class correctly: """ -import os, magic, gzip, langid, pprint, redis, operator, string, re, json, ConfigParser +import os +import magic +import gzip +import pprint +import redis +import operator +import string +import re +import json +import ConfigParser +import cStringIO from Date import Date from Hash import Hash @@ -25,11 +35,10 @@ from langid.langid import LanguageIdentifier, model from nltk.tokenize import RegexpTokenizer from textblob import TextBlob -from lib_refine import * - clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty)) """It filters out non-printable characters from the string it receives.""" + class Paste(object): """ This class representing a Paste as an object. @@ -50,38 +59,29 @@ class Paste(object): configfile = './packages/config.cfg' cfg = ConfigParser.ConfigParser() cfg.read(configfile) + self.cache = redis.StrictRedis( + host=cfg.get("Redis_Queues", "host"), + port=cfg.getint("Redis_Queues", "port"), + db=cfg.getint("Redis_Queues", "db")) self.p_path = p_path - self.p_name = self.p_path.split('/')[-1] + self.p_size = round(os.path.getsize(self.p_path)/1024.0, 2) + self.p_mime = magic.from_buffer(self.get_p_content(), mime=True) - self.p_size = round(os.path.getsize(self.p_path)/1024.0,2) - - self.cache = redis.StrictRedis( - host = cfg.get("Redis_Queues", "host"), - port = cfg.getint("Redis_Queues", "port"), - db = cfg.getint("Redis_Queues", "db")) - - self.p_mime = magic.from_buffer(self.get_p_content(), mime = True) - - self.p_encoding = None - - #Assuming that the paste will alway be in a day folder which is itself + # Assuming that the paste will alway be in a day folder which is itself # in a month folder which is itself in a year folder. # /year/month/day/paste.gz var = self.p_path.split('/') self.p_date = Date(var[-4], var[-3], var[-2]) - - self.p_hash_kind = None - self.p_hash = None - - self.p_langage = None - - self.p_nb_lines = None - self.p_max_length_line = None - self.p_source = var[-5] + self.p_encoding = None + self.p_hash_kind = None + self.p_hash = None + self.p_langage = None + self.p_nb_lines = None + self.p_max_length_line = None def get_p_content(self): """ @@ -92,16 +92,18 @@ class Paste(object): PST.get_p_content() """ - r_serv = self.cache - paste = r_serv.get(self.p_path) + paste = self.cache.get(self.p_path) if paste is None: - with gzip.open(self.p_path, 'rb') as F: - paste = F.read() - r_serv.set(self.p_path, paste) - r_serv.expire(self.p_path, 300) + with gzip.open(self.p_path, 'rb') as f: + paste = f.read() + self.cache.set(self.p_path, paste) + self.cache.expire(self.p_path, 300) return paste + def get_p_content_as_file(self): + return cStringIO.StringIO(self.get_p_content()) + def get_lines_info(self): """ Returning and setting the number of lines and the maximum lenght of the @@ -112,15 +114,17 @@ class Paste(object): :Example: PST.get_lines_info() """ - max_length_line = 0 - with gzip.open(self.p_path, 'rb') as F: - for nb_line in enumerate(F): - if len(nb_line[1]) >= max_length_line: - max_length_line = len(nb_line[1]) - - self.p_nb_lines = nb_line[0] - self.p_max_length_line = max_length_line - return (nb_line[0], max_length_line) + if self.p_nb_lines is None or self.p_max_length_line is None: + max_length_line = 0 + f = self.get_p_content_as_file() + for line_id, line in enumerate(f): + length = len(line) + if length >= max_length_line: + max_length_line = length + f.close() + self.p_nb_lines = line_id + self.p_max_length_line = max_length_line + return (self.p_nb_lines, self.p_max_length_line) def _get_p_encoding(self): """ @@ -130,11 +134,10 @@ class Paste(object): """ try: - return magic.Magic(mime_encoding = True).from_buffer(self.get_p_content()) + return magic.Magic(mime_encoding=True).from_buffer(self.get_p_content()) except magic.MagicException: pass - def _set_p_hash_kind(self, hashkind): """ Setting the hash (as an object) used for futur operation on it. @@ -173,9 +176,7 @@ class Paste(object): ..seealso: git@github.com:saffsd/langid.py.git """ - identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) - return identifier.classify(self.get_p_content()) def _get_p_hash_kind(self): @@ -184,7 +185,7 @@ class Paste(object): def _get_p_date(self): return self.p_date - def _get_hash_lines(self, min = 1, start = 1, jump = 10): + def _get_hash_lines(self, min=1, start=1, jump=10): """ Returning all the lines of the paste hashed. @@ -210,20 +211,17 @@ class Paste(object): """ S = set([]) - with gzip.open(self.p_path, 'rb') as F: - - for num, line in enumerate(F, start): - - if len(line) >= min: - if jump > 1: - if (num % jump) == 1 : - S.add(self.p_hash_kind.Calculate(line)) - else: + f = self.get_p_content_as_file() + for num, line in enumerate(f, start): + if len(line) >= min: + if jump > 1: + if (num % jump) == 1: S.add(self.p_hash_kind.Calculate(line)) + else: + S.add(self.p_hash_kind.Calculate(line)) return S - - def is_duplicate(self, obj, min = 1, percent = 50, start = 1, jump = 10): + def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10): """ Returning the percent of similarity with another paste. ( Using the previous hashing method ) @@ -264,8 +262,7 @@ class Paste(object): else: return False, var - - def save_all_attributes_redis(self, r_serv, key = None): + def save_all_attributes_redis(self, r_serv, key=None): """ Saving all the attributes in a "Redis-like" Database (Redis, LevelDB) @@ -281,23 +278,25 @@ class Paste(object): PST.save_all_attributes_redis(r_serv) """ - #LevelDB Compatibility - r_serv.hset(self.p_path, "p_name", self.p_name) - r_serv.hset(self.p_path, "p_size", self.p_size) - r_serv.hset(self.p_path, "p_mime", self.p_mime) - #r_serv.hset(self.p_path, "p_encoding", self.p_encoding) - r_serv.hset(self.p_path, "p_date", self._get_p_date()) - r_serv.hset(self.p_path, "p_hash_kind", self._get_p_hash_kind()) - r_serv.hset(self.p_path, "p_hash", self.p_hash) - #r_serv.hset(self.p_path, "p_langage", self.p_langage) - #r_serv.hset(self.p_path, "p_nb_lines", self.p_nb_lines) - #r_serv.hset(self.p_path, "p_max_length_line", self.p_max_length_line) - #r_serv.hset(self.p_path, "p_categories", self.p_categories) - r_serv.hset(self.p_path, "p_source", self.p_source) - if key != None: - r_serv.sadd(key, self.p_path) + # LevelDB Compatibility + p = r_serv.pipeline(False) + p.hset(self.p_path, "p_name", self.p_name) + p.hset(self.p_path, "p_size", self.p_size) + p.hset(self.p_path, "p_mime", self.p_mime) + # p.hset(self.p_path, "p_encoding", self.p_encoding) + p.hset(self.p_path, "p_date", self._get_p_date()) + p.hset(self.p_path, "p_hash_kind", self._get_p_hash_kind()) + p.hset(self.p_path, "p_hash", self.p_hash) + # p.hset(self.p_path, "p_langage", self.p_langage) + # p.hset(self.p_path, "p_nb_lines", self.p_nb_lines) + # p.hset(self.p_path, "p_max_length_line", self.p_max_length_line) + # p.hset(self.p_path, "p_categories", self.p_categories) + p.hset(self.p_path, "p_source", self.p_source) + if key is not None: + p.sadd(key, self.p_path) else: pass + p.execute() def save_attribute_redis(self, r_serv, attr_name, value): """ @@ -308,11 +307,10 @@ class Paste(object): else: r_serv.hset(self.p_path, attr_name, json.dumps(value)) - def _get_from_redis(self,r_serv): + def _get_from_redis(self, r_serv): return r_serv.hgetall(self.p_hash) - - def _get_top_words(self, sort = False): + def _get_top_words(self, sort=False): """ Tokenising method: Returning a sorted list or a set of paste's words @@ -325,28 +323,23 @@ class Paste(object): """ words = {} tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', - gaps = True, - discard_empty = True) + gaps=True, discard_empty=True) - blob = TextBlob(clean(self.get_p_content()), - tokenizer = tokenizer) + blob = TextBlob(clean(self.get_p_content()), tokenizer=tokenizer) for word in blob.tokens: if word in words.keys(): num = words[word] else: num = 0 - words[word] = num + 1 - if sort: - var = sorted(words.iteritems(), key = operator.itemgetter(1), reverse = True) + var = sorted(words.iteritems(), key=operator.itemgetter(1), reverse=True) else: var = words return var - def _get_word(self, word): """ Returning a specific word and his occurence if present in the paste @@ -358,7 +351,6 @@ class Paste(object): """ return [item for item in self._get_top_words() if item[0] == word] - def get_regex(self, regex): """ Returning matches with the regex given as an argument.