From ece3bc173e7c8776992de2e830c440f4cc8fef97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= <raphael@vinot.info>
Date: Wed, 13 Aug 2014 11:56:22 +0200
Subject: [PATCH] Cleanup of main Paste module

---
 bin/packages/Paste.py | 162 ++++++++++++++++++++----------------------
 1 file changed, 77 insertions(+), 85 deletions(-)

diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py
index 4e5d809c..f8c6dffa 100755
--- a/bin/packages/Paste.py
+++ b/bin/packages/Paste.py
@@ -16,7 +16,17 @@ Conditions to fulfill to be able to use this class correctly:
 
 """
 
-import os, magic, gzip, langid, pprint, redis, operator, string, re, json, ConfigParser
+import os
+import magic
+import gzip
+import pprint
+import redis
+import operator
+import string
+import re
+import json
+import ConfigParser
+import cStringIO
 from Date import Date
 from Hash import Hash
 
@@ -25,11 +35,10 @@ from langid.langid import LanguageIdentifier, model
 from nltk.tokenize import RegexpTokenizer
 from textblob import TextBlob
 
-from lib_refine import *
-
 clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
 """It filters out non-printable characters from the string it receives."""
 
+
 class Paste(object):
     """
     This class representing a Paste as an object.
@@ -50,38 +59,29 @@ class Paste(object):
         configfile = './packages/config.cfg'
         cfg = ConfigParser.ConfigParser()
         cfg.read(configfile)
+        self.cache = redis.StrictRedis(
+            host=cfg.get("Redis_Queues", "host"),
+            port=cfg.getint("Redis_Queues", "port"),
+            db=cfg.getint("Redis_Queues", "db"))
 
         self.p_path = p_path
-
         self.p_name = self.p_path.split('/')[-1]
+        self.p_size = round(os.path.getsize(self.p_path)/1024.0, 2)
+        self.p_mime = magic.from_buffer(self.get_p_content(), mime=True)
 
-        self.p_size = round(os.path.getsize(self.p_path)/1024.0,2)
-
-        self.cache = redis.StrictRedis(
-            host = cfg.get("Redis_Queues", "host"),
-            port = cfg.getint("Redis_Queues", "port"),
-            db = cfg.getint("Redis_Queues", "db"))
-
-        self.p_mime = magic.from_buffer(self.get_p_content(), mime = True)
-
-        self.p_encoding = None
-
-        #Assuming that the paste will alway be in a day folder which is itself
+        # Assuming that the paste will always be in a day folder which is itself
         # in a month folder which is itself in a year folder.
         # /year/month/day/paste.gz
         var = self.p_path.split('/')
         self.p_date = Date(var[-4], var[-3], var[-2])
-
-        self.p_hash_kind = None
-        self.p_hash = None
-
-        self.p_langage = None
-
-        self.p_nb_lines = None
-        self.p_max_length_line = None
-
         self.p_source = var[-5]
 
+        self.p_encoding = None
+        self.p_hash_kind = None
+        self.p_hash = None
+        self.p_langage = None
+        self.p_nb_lines = None
+        self.p_max_length_line = None
 
     def get_p_content(self):
         """
@@ -92,16 +92,18 @@ class Paste(object):
         PST.get_p_content()
 
         """
-        r_serv = self.cache
 
-        paste = r_serv.get(self.p_path)
+        paste = self.cache.get(self.p_path)
         if paste is None:
-            with gzip.open(self.p_path, 'rb') as F:
-		paste = F.read()
-		r_serv.set(self.p_path, paste)
-                r_serv.expire(self.p_path, 300)
+            with gzip.open(self.p_path, 'rb') as f:
+                paste = f.read()
+                self.cache.set(self.p_path, paste)
+                self.cache.expire(self.p_path, 300)
         return paste
 
+    def get_p_content_as_file(self):
+        return cStringIO.StringIO(self.get_p_content())
+
     def get_lines_info(self):
         """
         Returning and setting the number of lines and the maximum lenght of the
@@ -112,15 +114,17 @@ class Paste(object):
         :Example: PST.get_lines_info()
 
         """
-        max_length_line = 0
-        with gzip.open(self.p_path, 'rb') as F:
-            for nb_line in enumerate(F):
-                if len(nb_line[1]) >= max_length_line:
-                    max_length_line = len(nb_line[1])
-
-        self.p_nb_lines = nb_line[0]
-        self.p_max_length_line = max_length_line
-        return (nb_line[0], max_length_line)
+        if self.p_nb_lines is None or self.p_max_length_line is None:
+            max_length_line = 0
+            f = self.get_p_content_as_file()
+            for line_id, line in enumerate(f):
+                length = len(line)
+                if length >= max_length_line:
+                    max_length_line = length
+            f.close()
+            self.p_nb_lines = line_id
+            self.p_max_length_line = max_length_line
+        return (self.p_nb_lines, self.p_max_length_line)
 
     def _get_p_encoding(self):
         """
@@ -130,11 +134,10 @@ class Paste(object):
 
         """
         try:
-            return magic.Magic(mime_encoding = True).from_buffer(self.get_p_content())
+            return magic.Magic(mime_encoding=True).from_buffer(self.get_p_content())
         except magic.MagicException:
             pass
 
-
     def _set_p_hash_kind(self, hashkind):
         """
         Setting the hash (as an object) used for futur operation on it.
@@ -173,9 +176,7 @@ class Paste(object):
         ..seealso: git@github.com:saffsd/langid.py.git
 
         """
-
         identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
-
         return identifier.classify(self.get_p_content())
 
     def _get_p_hash_kind(self):
@@ -184,7 +185,7 @@ class Paste(object):
     def _get_p_date(self):
         return self.p_date
 
-    def _get_hash_lines(self, min = 1, start = 1, jump = 10):
+    def _get_hash_lines(self, min=1, start=1, jump=10):
         """
         Returning all the lines of the paste hashed.
 
@@ -210,20 +211,17 @@ class Paste(object):
 
         """
         S = set([])
-        with gzip.open(self.p_path, 'rb') as F:
-
-            for num, line in enumerate(F, start):
-
-                if len(line) >= min:
-                    if jump > 1:
-                        if (num % jump) == 1 :
-                            S.add(self.p_hash_kind.Calculate(line))
-                    else:
+        f = self.get_p_content_as_file()
+        for num, line in enumerate(f, start):
+            if len(line) >= min:
+                if jump > 1:
+                    if (num % jump) == 1:
                         S.add(self.p_hash_kind.Calculate(line))
+                else:
+                    S.add(self.p_hash_kind.Calculate(line))
         return S
 
-
-    def is_duplicate(self, obj, min = 1, percent = 50, start = 1, jump = 10):
+    def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
         """
         Returning the percent of similarity with another paste.
         ( Using the previous hashing method )
@@ -264,8 +262,7 @@ class Paste(object):
         else:
             return False, var
 
-
-    def save_all_attributes_redis(self, r_serv, key = None):
+    def save_all_attributes_redis(self, r_serv, key=None):
         """
         Saving all the attributes in a "Redis-like" Database (Redis, LevelDB)
 
@@ -281,23 +278,25 @@ class Paste(object):
         PST.save_all_attributes_redis(r_serv)
 
         """
-        #LevelDB Compatibility
-        r_serv.hset(self.p_path, "p_name", self.p_name)
-        r_serv.hset(self.p_path, "p_size", self.p_size)
-        r_serv.hset(self.p_path, "p_mime", self.p_mime)
-        #r_serv.hset(self.p_path, "p_encoding", self.p_encoding)
-        r_serv.hset(self.p_path, "p_date", self._get_p_date())
-        r_serv.hset(self.p_path, "p_hash_kind", self._get_p_hash_kind())
-        r_serv.hset(self.p_path, "p_hash", self.p_hash)
-        #r_serv.hset(self.p_path, "p_langage", self.p_langage)
-        #r_serv.hset(self.p_path, "p_nb_lines", self.p_nb_lines)
-        #r_serv.hset(self.p_path, "p_max_length_line", self.p_max_length_line)
-        #r_serv.hset(self.p_path, "p_categories", self.p_categories)
-        r_serv.hset(self.p_path, "p_source", self.p_source)
-        if key != None:
-            r_serv.sadd(key, self.p_path)
+        # LevelDB Compatibility
+        p = r_serv.pipeline(False)
+        p.hset(self.p_path, "p_name", self.p_name)
+        p.hset(self.p_path, "p_size", self.p_size)
+        p.hset(self.p_path, "p_mime", self.p_mime)
+        # p.hset(self.p_path, "p_encoding", self.p_encoding)
+        p.hset(self.p_path, "p_date", self._get_p_date())
+        p.hset(self.p_path, "p_hash_kind", self._get_p_hash_kind())
+        p.hset(self.p_path, "p_hash", self.p_hash)
+        # p.hset(self.p_path, "p_langage", self.p_langage)
+        # p.hset(self.p_path, "p_nb_lines", self.p_nb_lines)
+        # p.hset(self.p_path, "p_max_length_line", self.p_max_length_line)
+        # p.hset(self.p_path, "p_categories", self.p_categories)
+        p.hset(self.p_path, "p_source", self.p_source)
+        if key is not None:
+            p.sadd(key, self.p_path)
         else:
             pass
+        p.execute()
 
     def save_attribute_redis(self, r_serv, attr_name, value):
         """
@@ -308,11 +307,10 @@ class Paste(object):
         else:
             r_serv.hset(self.p_path, attr_name, json.dumps(value))
 
-    def _get_from_redis(self,r_serv):
+    def _get_from_redis(self, r_serv):
         return r_serv.hgetall(self.p_hash)
 
-
-    def _get_top_words(self, sort = False):
+    def _get_top_words(self, sort=False):
         """
         Tokenising method: Returning a sorted list or a set of paste's words
 
@@ -325,28 +323,23 @@ class Paste(object):
         """
         words = {}
         tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
-        gaps = True,
-        discard_empty = True)
+                                    gaps=True, discard_empty=True)
 
-        blob = TextBlob(clean(self.get_p_content()),
-        tokenizer = tokenizer)
+        blob = TextBlob(clean(self.get_p_content()), tokenizer=tokenizer)
 
         for word in blob.tokens:
             if word in words.keys():
                 num = words[word]
             else:
                 num = 0
-
             words[word] = num + 1
-
         if sort:
-            var = sorted(words.iteritems(), key = operator.itemgetter(1), reverse = True)
+            var = sorted(words.iteritems(), key=operator.itemgetter(1), reverse=True)
         else:
             var = words
 
         return var
 
-
     def _get_word(self, word):
         """
         Returning a specific word and his occurence if present in the paste
@@ -358,7 +351,6 @@ class Paste(object):
         """
         return [item for item in self._get_top_words() if item[0] == word]
 
-
     def get_regex(self, regex):
         """
         Returning matches with the regex given as an argument.