mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			Cleanup of main Paste module
							parent
							
								
									5b17d416c8
								
							
						
					
					
						commit
						ece3bc173e
					
				|  | @ -16,7 +16,17 @@ Conditions to fulfill to be able to use this class correctly: | |||
| 
 | ||||
| """ | ||||
| 
 | ||||
| import os, magic, gzip, langid, pprint, redis, operator, string, re, json, ConfigParser | ||||
| import os | ||||
| import magic | ||||
| import gzip | ||||
| import pprint | ||||
| import redis | ||||
| import operator | ||||
| import string | ||||
| import re | ||||
| import json | ||||
| import ConfigParser | ||||
| import cStringIO | ||||
| from Date import Date | ||||
| from Hash import Hash | ||||
| 
 | ||||
|  | @ -25,11 +35,10 @@ from langid.langid import LanguageIdentifier, model | |||
| from nltk.tokenize import RegexpTokenizer | ||||
| from textblob import TextBlob | ||||
| 
 | ||||
| from lib_refine import * | ||||
| 
 | ||||
| clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty)) | ||||
| """It filters out non-printable characters from the string it receives.""" | ||||
| 
 | ||||
| 
 | ||||
| class Paste(object): | ||||
|     """ | ||||
|     This class representing a Paste as an object. | ||||
|  | @ -50,38 +59,29 @@ class Paste(object): | |||
|         configfile = './packages/config.cfg' | ||||
|         cfg = ConfigParser.ConfigParser() | ||||
|         cfg.read(configfile) | ||||
|         self.cache = redis.StrictRedis( | ||||
|             host=cfg.get("Redis_Queues", "host"), | ||||
|             port=cfg.getint("Redis_Queues", "port"), | ||||
|             db=cfg.getint("Redis_Queues", "db")) | ||||
| 
 | ||||
|         self.p_path = p_path | ||||
| 
 | ||||
|         self.p_name = self.p_path.split('/')[-1] | ||||
|         self.p_size = round(os.path.getsize(self.p_path)/1024.0, 2) | ||||
|         self.p_mime = magic.from_buffer(self.get_p_content(), mime=True) | ||||
| 
 | ||||
|         self.p_size = round(os.path.getsize(self.p_path)/1024.0,2) | ||||
| 
 | ||||
|         self.cache = redis.StrictRedis( | ||||
|             host = cfg.get("Redis_Queues", "host"), | ||||
|             port = cfg.getint("Redis_Queues", "port"), | ||||
|             db = cfg.getint("Redis_Queues", "db")) | ||||
| 
 | ||||
|         self.p_mime = magic.from_buffer(self.get_p_content(), mime = True) | ||||
| 
 | ||||
|         self.p_encoding = None | ||||
| 
 | ||||
|         #Assuming that the paste will alway be in a day folder which is itself | ||||
|         # Assuming that the paste will alway be in a day folder which is itself | ||||
|         # in a month folder which is itself in a year folder. | ||||
|         # /year/month/day/paste.gz | ||||
|         var = self.p_path.split('/') | ||||
|         self.p_date = Date(var[-4], var[-3], var[-2]) | ||||
| 
 | ||||
|         self.p_hash_kind = None | ||||
|         self.p_hash = None | ||||
| 
 | ||||
|         self.p_langage = None | ||||
| 
 | ||||
|         self.p_nb_lines = None | ||||
|         self.p_max_length_line = None | ||||
| 
 | ||||
|         self.p_source = var[-5] | ||||
| 
 | ||||
|         self.p_encoding = None | ||||
|         self.p_hash_kind = None | ||||
|         self.p_hash = None | ||||
|         self.p_langage = None | ||||
|         self.p_nb_lines = None | ||||
|         self.p_max_length_line = None | ||||
| 
 | ||||
|     def get_p_content(self): | ||||
|         """ | ||||
|  | @ -92,16 +92,18 @@ class Paste(object): | |||
|         PST.get_p_content() | ||||
| 
 | ||||
|         """ | ||||
|         r_serv = self.cache | ||||
| 
 | ||||
|         paste = r_serv.get(self.p_path) | ||||
|         paste = self.cache.get(self.p_path) | ||||
|         if paste is None: | ||||
|             with gzip.open(self.p_path, 'rb') as F: | ||||
| 		paste = F.read() | ||||
| 		r_serv.set(self.p_path, paste) | ||||
|                 r_serv.expire(self.p_path, 300) | ||||
|             with gzip.open(self.p_path, 'rb') as f: | ||||
|                 paste = f.read() | ||||
|                 self.cache.set(self.p_path, paste) | ||||
|                 self.cache.expire(self.p_path, 300) | ||||
|         return paste | ||||
| 
 | ||||
|     def get_p_content_as_file(self): | ||||
|         return cStringIO.StringIO(self.get_p_content()) | ||||
| 
 | ||||
|     def get_lines_info(self): | ||||
|         """ | ||||
|         Returning and setting the number of lines and the maximum lenght of the | ||||
|  | @ -112,15 +114,17 @@ class Paste(object): | |||
|         :Example: PST.get_lines_info() | ||||
| 
 | ||||
|         """ | ||||
|         max_length_line = 0 | ||||
|         with gzip.open(self.p_path, 'rb') as F: | ||||
|             for nb_line in enumerate(F): | ||||
|                 if len(nb_line[1]) >= max_length_line: | ||||
|                     max_length_line = len(nb_line[1]) | ||||
| 
 | ||||
|         self.p_nb_lines = nb_line[0] | ||||
|         self.p_max_length_line = max_length_line | ||||
|         return (nb_line[0], max_length_line) | ||||
|         if self.p_nb_lines is None or self.p_max_length_line is None: | ||||
|             max_length_line = 0 | ||||
|             f = self.get_p_content_as_file() | ||||
|             for line_id, line in enumerate(f): | ||||
|                 length = len(line) | ||||
|                 if length >= max_length_line: | ||||
|                     max_length_line = length | ||||
|             f.close() | ||||
|             self.p_nb_lines = line_id | ||||
|             self.p_max_length_line = max_length_line | ||||
|         return (self.p_nb_lines, self.p_max_length_line) | ||||
| 
 | ||||
|     def _get_p_encoding(self): | ||||
|         """ | ||||
|  | @ -130,11 +134,10 @@ class Paste(object): | |||
| 
 | ||||
|         """ | ||||
|         try: | ||||
|             return magic.Magic(mime_encoding = True).from_buffer(self.get_p_content()) | ||||
|             return magic.Magic(mime_encoding=True).from_buffer(self.get_p_content()) | ||||
|         except magic.MagicException: | ||||
|             pass | ||||
| 
 | ||||
| 
 | ||||
|     def _set_p_hash_kind(self, hashkind): | ||||
|         """ | ||||
|         Setting the hash (as an object) used for futur operation on it. | ||||
|  | @ -173,9 +176,7 @@ class Paste(object): | |||
|         ..seealso: git@github.com:saffsd/langid.py.git | ||||
| 
 | ||||
|         """ | ||||
| 
 | ||||
|         identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) | ||||
| 
 | ||||
|         return identifier.classify(self.get_p_content()) | ||||
| 
 | ||||
|     def _get_p_hash_kind(self): | ||||
|  | @ -184,7 +185,7 @@ class Paste(object): | |||
|     def _get_p_date(self): | ||||
|         return self.p_date | ||||
| 
 | ||||
|     def _get_hash_lines(self, min = 1, start = 1, jump = 10): | ||||
|     def _get_hash_lines(self, min=1, start=1, jump=10): | ||||
|         """ | ||||
|         Returning all the lines of the paste hashed. | ||||
| 
 | ||||
|  | @ -210,20 +211,17 @@ class Paste(object): | |||
| 
 | ||||
|         """ | ||||
|         S = set([]) | ||||
|         with gzip.open(self.p_path, 'rb') as F: | ||||
| 
 | ||||
|             for num, line in enumerate(F, start): | ||||
| 
 | ||||
|                 if len(line) >= min: | ||||
|                     if jump > 1: | ||||
|                         if (num % jump) == 1 : | ||||
|                             S.add(self.p_hash_kind.Calculate(line)) | ||||
|                     else: | ||||
|         f = self.get_p_content_as_file() | ||||
|         for num, line in enumerate(f, start): | ||||
|             if len(line) >= min: | ||||
|                 if jump > 1: | ||||
|                     if (num % jump) == 1: | ||||
|                         S.add(self.p_hash_kind.Calculate(line)) | ||||
|                 else: | ||||
|                     S.add(self.p_hash_kind.Calculate(line)) | ||||
|         return S | ||||
| 
 | ||||
| 
 | ||||
|     def is_duplicate(self, obj, min = 1, percent = 50, start = 1, jump = 10): | ||||
|     def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10): | ||||
|         """ | ||||
|         Returning the percent of similarity with another paste. | ||||
|         ( Using the previous hashing method ) | ||||
|  | @ -264,8 +262,7 @@ class Paste(object): | |||
|         else: | ||||
|             return False, var | ||||
| 
 | ||||
| 
 | ||||
|     def save_all_attributes_redis(self, r_serv, key = None): | ||||
|     def save_all_attributes_redis(self, r_serv, key=None): | ||||
|         """ | ||||
|         Saving all the attributes in a "Redis-like" Database (Redis, LevelDB) | ||||
| 
 | ||||
|  | @ -281,23 +278,25 @@ class Paste(object): | |||
|         PST.save_all_attributes_redis(r_serv) | ||||
| 
 | ||||
|         """ | ||||
|         #LevelDB Compatibility | ||||
|         r_serv.hset(self.p_path, "p_name", self.p_name) | ||||
|         r_serv.hset(self.p_path, "p_size", self.p_size) | ||||
|         r_serv.hset(self.p_path, "p_mime", self.p_mime) | ||||
|         #r_serv.hset(self.p_path, "p_encoding", self.p_encoding) | ||||
|         r_serv.hset(self.p_path, "p_date", self._get_p_date()) | ||||
|         r_serv.hset(self.p_path, "p_hash_kind", self._get_p_hash_kind()) | ||||
|         r_serv.hset(self.p_path, "p_hash", self.p_hash) | ||||
|         #r_serv.hset(self.p_path, "p_langage", self.p_langage) | ||||
|         #r_serv.hset(self.p_path, "p_nb_lines", self.p_nb_lines) | ||||
|         #r_serv.hset(self.p_path, "p_max_length_line", self.p_max_length_line) | ||||
|         #r_serv.hset(self.p_path, "p_categories", self.p_categories) | ||||
|         r_serv.hset(self.p_path, "p_source", self.p_source) | ||||
|         if key != None: | ||||
|             r_serv.sadd(key, self.p_path) | ||||
|         # LevelDB Compatibility | ||||
|         p = r_serv.pipeline(False) | ||||
|         p.hset(self.p_path, "p_name", self.p_name) | ||||
|         p.hset(self.p_path, "p_size", self.p_size) | ||||
|         p.hset(self.p_path, "p_mime", self.p_mime) | ||||
|         # p.hset(self.p_path, "p_encoding", self.p_encoding) | ||||
|         p.hset(self.p_path, "p_date", self._get_p_date()) | ||||
|         p.hset(self.p_path, "p_hash_kind", self._get_p_hash_kind()) | ||||
|         p.hset(self.p_path, "p_hash", self.p_hash) | ||||
|         # p.hset(self.p_path, "p_langage", self.p_langage) | ||||
|         # p.hset(self.p_path, "p_nb_lines", self.p_nb_lines) | ||||
|         # p.hset(self.p_path, "p_max_length_line", self.p_max_length_line) | ||||
|         # p.hset(self.p_path, "p_categories", self.p_categories) | ||||
|         p.hset(self.p_path, "p_source", self.p_source) | ||||
|         if key is not None: | ||||
|             p.sadd(key, self.p_path) | ||||
|         else: | ||||
|             pass | ||||
|         p.execute() | ||||
| 
 | ||||
|     def save_attribute_redis(self, r_serv, attr_name, value): | ||||
|         """ | ||||
|  | @ -308,11 +307,10 @@ class Paste(object): | |||
|         else: | ||||
|             r_serv.hset(self.p_path, attr_name, json.dumps(value)) | ||||
| 
 | ||||
|     def _get_from_redis(self,r_serv): | ||||
|     def _get_from_redis(self, r_serv): | ||||
|         return r_serv.hgetall(self.p_hash) | ||||
| 
 | ||||
| 
 | ||||
|     def _get_top_words(self, sort = False): | ||||
|     def _get_top_words(self, sort=False): | ||||
|         """ | ||||
|         Tokenising method: Returning a sorted list or a set of paste's words | ||||
| 
 | ||||
|  | @ -325,28 +323,23 @@ class Paste(object): | |||
|         """ | ||||
|         words = {} | ||||
|         tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', | ||||
|         gaps = True, | ||||
|         discard_empty = True) | ||||
|                                     gaps=True, discard_empty=True) | ||||
| 
 | ||||
|         blob = TextBlob(clean(self.get_p_content()), | ||||
|         tokenizer = tokenizer) | ||||
|         blob = TextBlob(clean(self.get_p_content()), tokenizer=tokenizer) | ||||
| 
 | ||||
|         for word in blob.tokens: | ||||
|             if word in words.keys(): | ||||
|                 num = words[word] | ||||
|             else: | ||||
|                 num = 0 | ||||
| 
 | ||||
|             words[word] = num + 1 | ||||
| 
 | ||||
|         if sort: | ||||
|             var = sorted(words.iteritems(), key = operator.itemgetter(1), reverse = True) | ||||
|             var = sorted(words.iteritems(), key=operator.itemgetter(1), reverse=True) | ||||
|         else: | ||||
|             var = words | ||||
| 
 | ||||
|         return var | ||||
| 
 | ||||
| 
 | ||||
|     def _get_word(self, word): | ||||
|         """ | ||||
|         Returning a specific word and his occurence if present in the paste | ||||
|  | @ -358,7 +351,6 @@ class Paste(object): | |||
|         """ | ||||
|         return [item for item in self._get_top_words() if item[0] == word] | ||||
| 
 | ||||
| 
 | ||||
|     def get_regex(self, regex): | ||||
|         """ | ||||
|         Returning matches with the regex given as an argument. | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Raphaël Vinot
						Raphaël Vinot