chg: [merge] merge update into advanced_crawler

2019-04-10 16:41:06 +02:00 · 2019-04-10 16:41:06 +02:00 · d44acea04d
parent b4bee29a33 2589fc2161
commit d44acea04d
30 changed files with 126 additions and 102 deletions
--- a/OVERVIEW.md
+++ b/OVERVIEW.md
@ -143,12 +143,30 @@ ARDB_DB

 * DB 3 - Trending
 * DB 4 - Sentiment
+	----------------------------------------- SENTIMENT ------------------------------------
+
+	SET - 'Provider_set'				Provider
+		
+	KEY - 'UniqID' 					INT
+
+	SET - provider_timestamp			UniqID
+
+	SET - UniqID					avg_score
+
 * DB 5 - TermCred
 * DB 6 - Tags
-* DB 7 - Metadata
-* DB 8 - Statistics
+	----------------------------------------------------------------------------------------
+	
+	SET - tag					paste*
+
+	----------------------------------------------------------------------------------------

 * DB 7 - Metadata:
+	----------------------------------------------------------------------------------------
+	
+	SET - 'tag:' + paste				tag
+
+	----------------------------------------------------------------------------------------
 	----------------------------------------- BASE64 ----------------------------------------

 	HSET - 'metadata_hash:'+hash	'saved_path'		saved_path
@ -185,3 +203,9 @@ ARDB_DB

 	GET  - 'base64_decoded:'+date	nd_decoded
 	GET  - 'binary_decoded:'+date	nd_decoded
+
+* DB 8 - Statistics
+* DB 9 - Onion:
+	----------------------------------------------------------------------------------------
+
+	
--- a/bin/ApiKey.py
+++ b/bin/ApiKey.py
@ -40,7 +40,7 @@ def search_api_key(message):
            print('found google api key')
            print(to_print)
            publisher.warning('{}Checked {} found Google API Key;{}'.format(
-                to_print, len(google_api_key), paste.p_path))
+                to_print, len(google_api_key), paste.p_rel_path))
            msg = 'infoleak:automatic-detection="google-api-key";{}'.format(filename)
            p.populate_set_out(msg, 'Tags')

@ -49,7 +49,7 @@ def search_api_key(message):
            print(to_print)
            total = len(aws_access_key) + len(aws_secret_key)
            publisher.warning('{}Checked {} found AWS Key;{}'.format(
-                to_print, total, paste.p_path))
+                to_print, total, paste.p_rel_path))
            msg = 'infoleak:automatic-detection="aws-key";{}'.format(filename)
            p.populate_set_out(msg, 'Tags')

--- a/bin/Attributes.py
+++ b/bin/Attributes.py
@ -43,8 +43,8 @@ if __name__ == "__main__":
            # FIXME why not all saving everything there.
            PST.save_all_attributes_redis()
            # FIXME Not used.
-            PST.store.sadd("Pastes_Objects", PST.p_path)
+            PST.store.sadd("Pastes_Objects", PST.p_rel_path)
        except IOError:
-            print("CRC Checksum Failed on :", PST.p_path)
+            print("CRC Checksum Failed on :", PST.p_rel_path)
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))
--- a/bin/BankAccount.py
+++ b/bin/BankAccount.py
@ -67,7 +67,7 @@ def check_all_iban(l_iban, paste, filename):
    if(nb_valid_iban > 0):
        to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name)
        publisher.warning('{}Checked found {} IBAN;{}'.format(
-            to_print, nb_valid_iban, paste.p_path))
+            to_print, nb_valid_iban, paste.p_rel_path))
        msg = 'infoleak:automatic-detection="iban";{}'.format(filename)
        p.populate_set_out(msg, 'Tags')

@ -113,7 +113,7 @@ if __name__ == "__main__":
            try:
                l_iban = iban_regex.findall(content)
            except TimeoutException:
-                 print ("{0} processing timeout".format(paste.p_path))
+                 print ("{0} processing timeout".format(paste.p_rel_path))
                 continue
            else:
                signal.alarm(0)
--- a/bin/Bitcoin.py
+++ b/bin/Bitcoin.py
@ -73,7 +73,7 @@ def search_key(content, message, paste):
                to_print = 'Bitcoin;{};{};{};'.format(paste.p_source, paste.p_date,
                                                    paste.p_name)
                publisher.warning('{}Detected {} Bitcoin private key;{}'.format(
-                    to_print, len(bitcoin_private_key),paste.p_path))
+                    to_print, len(bitcoin_private_key),paste.p_rel_path))

 if __name__ == "__main__":
    publisher.port = 6380
--- a/bin/Categ.py
+++ b/bin/Categ.py
@ -89,16 +89,10 @@ if __name__ == "__main__":
        paste = Paste.Paste(filename)
        content = paste.get_p_content()

-        #print('-----------------------------------------------------')
-        #print(filename)
-        #print(content)
-        #print('-----------------------------------------------------')
-
        for categ, pattern in tmp_dict.items():
            found = set(re.findall(pattern, content))
            if len(found) >= matchingThreshold:
-                msg = '{} {}'.format(paste.p_path, len(found))
-                #msg = " ".join( [paste.p_path, bytes(len(found))] )
+                msg = '{} {}'.format(paste.p_rel_path, len(found))

                print(msg, categ)
                p.populate_set_out(msg, categ)
@ -106,4 +100,4 @@ if __name__ == "__main__":
                publisher.info(
                    'Categ;{};{};{};Detected {} as {};{}'.format(
                        paste.p_source, paste.p_date, paste.p_name,
-                        len(found), categ, paste.p_path))
+                        len(found), categ, paste.p_rel_path))
--- a/bin/Credential.py
+++ b/bin/Credential.py
@ -97,7 +97,7 @@ if __name__ == "__main__":
        if sites_set:
            message += ' Related websites: {}'.format( (', '.join(sites_set)) )

-        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_path)
+        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path)

        print('\n '.join(creds))

--- a/bin/CreditCards.py
+++ b/bin/CreditCards.py
@ -77,16 +77,16 @@ if __name__ == "__main__":
                    paste.p_source, paste.p_date, paste.p_name)
                if (len(creditcard_set) > 0):
                    publisher.warning('{}Checked {} valid number(s);{}'.format(
-                        to_print, len(creditcard_set), paste.p_path))
+                        to_print, len(creditcard_set), paste.p_rel_path))
                    print('{}Checked {} valid number(s);{}'.format(
-                        to_print, len(creditcard_set), paste.p_path))
+                        to_print, len(creditcard_set), paste.p_rel_path))
                    #Send to duplicate
                    p.populate_set_out(filename, 'Duplicate')

                    msg = 'infoleak:automatic-detection="credit-card";{}'.format(filename)
                    p.populate_set_out(msg, 'Tags')
                else:
-                    publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_path))
+                    publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_rel_path))
        else:
            publisher.debug("Script creditcard is idling 1m")
            time.sleep(10)
--- a/bin/Decoder.py
+++ b/bin/Decoder.py
@ -226,7 +226,7 @@ if __name__ == '__main__':
            except TimeoutException:
                encoded_list = []
                p.incr_module_timeout_statistic() # add encoder type
-                print ("{0} processing timeout".format(paste.p_path))
+                print ("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)
--- a/bin/DomClassifier.py
+++ b/bin/DomClassifier.py
@ -54,14 +54,14 @@ def main():
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
-                        PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path))
+                        PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_rel_path))
                localizeddomains = c.localizedomain(cc=cc)
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
-                        PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path))
+                        PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_rel_path))
        except IOError:
-            print("CRC Checksum Failed on :", PST.p_path)
+            print("CRC Checksum Failed on :", PST.p_rel_path)
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))

--- a/bin/Duplicates.py
+++ b/bin/Duplicates.py
@ -142,17 +142,17 @@ if __name__ == "__main__":
                                paste_date = paste_date
                                paste_date = paste_date if paste_date != None else "No date available"
                                if paste_path != None:
-                                    if paste_path != PST.p_path:
+                                    if paste_path != PST.p_rel_path:
                                        hash_dico[dico_hash] = (hash_type, paste_path, percent, paste_date)

-                                        print('['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + '  and  ' + str(paste_path[44:]) + ' percentage: ' + str(percent))
+                                        print('['+hash_type+'] '+'comparing: ' + str(PST.p_rel_path) + '  and  ' + str(paste_path) + ' percentage: ' + str(percent))

                        except Exception:
                            print('hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash)

            # Add paste in DB after checking to prevent its analysis twice
            # hash_type_i -> index_i  AND  index_i -> PST.PATH
-            r_serv1.set(index, PST.p_path)
+            r_serv1.set(index, PST.p_rel_path)
            r_serv1.set(index+'_date', PST._get_p_date())
            r_serv1.sadd("INDEX", index)
            # Adding hashes in Redis
@ -180,7 +180,7 @@ if __name__ == "__main__":
                    PST.__setattr__("p_duplicate", dupl)
                    PST.save_attribute_duplicate(dupl)
                    PST.save_others_pastes_attribute_duplicate(dupl)
-                    publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path))
+                    publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_rel_path))
                    print('{}Detected {}'.format(to_print, len(dupl)))
                    print('')

@ -191,5 +191,5 @@ if __name__ == "__main__":
        except IOError:
            to_print = 'Duplicate;{};{};{};'.format(
                PST.p_source, PST.p_date, PST.p_name)
-            print("CRC Checksum Failed on :", PST.p_path)
+            print("CRC Checksum Failed on :", PST.p_rel_path)
            publisher.error('{}CRC Checksum Failed'.format(to_print))
--- a/bin/Global.py
+++ b/bin/Global.py
@ -45,6 +45,8 @@ if __name__ == '__main__':

    p = Process(config_section)

+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
+
    # LOGGING #
    publisher.info("Feed Script started to receive & publish.")

@ -78,8 +80,7 @@ if __name__ == '__main__':
            paste = rreplace(paste, file_name_paste, new_file_name_paste, 1)

        # Creating the full filepath
-        filename = os.path.join(os.environ['AIL_HOME'],
-                                p.config.get("Directories", "pastes"), paste)
+        filename = os.path.join(PASTES_FOLDER, paste)

        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
@ -102,6 +103,7 @@ if __name__ == '__main__':
            print(filename)
            print(type)
            print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
-            '''
-        p.populate_set_out(filename)
+        '''
+
+        p.populate_set_out(paste)
        processed_paste+=1
--- a/bin/LibInjection.py
+++ b/bin/LibInjection.py
@ -47,7 +47,7 @@ def analyse(url, path):
        paste = Paste.Paste(path)
        print("Detected (libinjection) SQL in URL: ")
        print(urllib.request.unquote(url))
-        to_print = 'LibInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_path)
+        to_print = 'LibInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_rel_path)
        publisher.warning(to_print)
        #Send to duplicate
        p.populate_set_out(path, 'Duplicate')
--- a/bin/Lines.py
+++ b/bin/Lines.py
@ -75,10 +75,11 @@ if __name__ == '__main__':
            PST.save_attribute_redis("p_max_length_line", lines_infos[1])

            # FIXME Not used.
-            PST.store.sadd("Pastes_Objects", PST.p_path)
+            PST.store.sadd("Pastes_Objects", PST.p_rel_path)
+            print(PST.p_rel_path)
            if lines_infos[1] < args.max:
-                p.populate_set_out( PST.p_path , 'LinesShort')
+                p.populate_set_out( PST.p_rel_path , 'LinesShort')
            else:
-                p.populate_set_out( PST.p_path , 'LinesLong')
+                p.populate_set_out( PST.p_rel_path , 'LinesLong')
        except IOError:
-            print("CRC Checksum Error on : ", PST.p_path)
+            print("CRC Checksum Error on : ", PST.p_rel_path)
--- a/bin/Mail.py
+++ b/bin/Mail.py
@ -78,7 +78,7 @@ if __name__ == "__main__":

                    to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\
                        format(PST.p_source, PST.p_date, PST.p_name,
-                               MX_values[0], PST.p_path)
+                               MX_values[0], PST.p_rel_path)
                    if MX_values[0] > is_critical:
                        publisher.warning(to_print)
                        #Send to duplicate
--- a/bin/Mixer.py
+++ b/bin/Mixer.py
@ -82,6 +82,8 @@ if __name__ == '__main__':
    ttl_key = cfg.getint("Module_Mixer", "ttl_duplicate")
    default_unnamed_feed_name = cfg.get("Module_Mixer", "default_unnamed_feed_name")

+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + '/'
+
    # STATS #
    processed_paste = 0
    processed_paste_per_feeder = {}
@ -104,12 +106,14 @@ if __name__ == '__main__':
                    feeder_name.replace(" ","")
                    if 'import_dir' in feeder_name:
                        feeder_name = feeder_name.split('/')[1]
-                    paste_name = complete_paste

                except ValueError as e:
                    feeder_name = default_unnamed_feed_name
                    paste_name = complete_paste

+                # remove absolute path
+                paste_name = paste_name.replace(PASTES_FOLDER, '', 1)
+
                # Processed paste
                processed_paste += 1
                try:
@ -119,6 +123,7 @@ if __name__ == '__main__':
                    processed_paste_per_feeder[feeder_name] = 1
                    duplicated_paste_per_feeder[feeder_name] = 0

+
                relay_message = "{0} {1}".format(paste_name, gzip64encoded)
                #relay_message = b" ".join( [paste_name, gzip64encoded] )

--- a/bin/Onion.py
+++ b/bin/Onion.py
@ -167,7 +167,7 @@ if __name__ == "__main__":
                except TimeoutException:
                    encoded_list = []
                    p.incr_module_timeout_statistic()
-                    print ("{0} processing timeout".format(PST.p_path))
+                    print ("{0} processing timeout".format(PST.p_rel_path))
                    continue

                signal.alarm(0)
@ -185,7 +185,7 @@ if __name__ == "__main__":
                            r_onion.sadd('i2p_domain', domain)
                            r_onion.sadd('i2p_link', url)
                            r_onion.sadd('i2p_domain_crawler_queue', domain)
-                            msg = '{};{}'.format(url,PST.p_path)
+                            msg = '{};{}'.format(url,PST.p_rel_path)
                            r_onion.sadd('i2p_crawler_queue', msg)
                '''

@ -200,10 +200,10 @@ if __name__ == "__main__":

                    if not activate_crawler:
                        publisher.warning('{}Detected {} .onion(s);{}'.format(
-                            to_print, len(domains_list),PST.p_path))
+                            to_print, len(domains_list),PST.p_rel_path))
                    else:
                        publisher.info('{}Detected {} .onion(s);{}'.format(
-                            to_print, len(domains_list),PST.p_path))
+                            to_print, len(domains_list),PST.p_rel_path))
                    now = datetime.datetime.now()
                    path = os.path.join('onions', str(now.year).zfill(4),
                                        str(now.month).zfill(2),
@ -232,7 +232,7 @@ if __name__ == "__main__":
                                if not r_onion.sismember('onion_domain_crawler_queue', domain):
                                    print('send to onion crawler')
                                    r_onion.sadd('onion_domain_crawler_queue', domain)
-                                    msg = '{};{}'.format(url,PST.p_path)
+                                    msg = '{};{}'.format(url,PST.p_rel_path)
                                    if not r_onion.hexists('onion_metadata:{}'.format(domain), 'first_seen'):
                                        r_onion.sadd('onion_crawler_priority_queue', msg)
                                        print('send to priority queue')
@ -242,13 +242,13 @@ if __name__ == "__main__":

                    else:
                        for url in fetch(p, r_cache, urls, domains_list, path):
-                            publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
+                            publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path))

                    # TAG Item
-                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
+                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path)
                    p.populate_set_out(msg, 'Tags')
                else:
-                    publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
+                    publisher.info('{}Onion related;{}'.format(to_print, PST.p_rel_path))

            prec_filename = filename
        else:
--- a/bin/RegexForTermsFrequency.py
+++ b/bin/RegexForTermsFrequency.py
@ -108,7 +108,7 @@ if __name__ == "__main__":
                try:
                    matched = compiled_regex.search(content)
                except TimeoutException:
-                    print ("{0} processing timeout".format(paste.p_path))
+                    print ("{0} processing timeout".format(paste.p_rel_path))
                    continue
                else:
                    signal.alarm(0)
--- a/bin/Release.py
+++ b/bin/Release.py
@ -54,7 +54,7 @@ if __name__ == "__main__":
            if len(releases) == 0:
                continue

-                to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_path)
+                to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_rel_path)
                print(to_print)
                if len(releases) > 30:
                    publisher.warning(to_print)
@ -63,7 +63,7 @@ if __name__ == "__main__":

        except TimeoutException:
            p.incr_module_timeout_statistic()
-            print ("{0} processing timeout".format(paste.p_path))
+            print ("{0} processing timeout".format(paste.p_rel_path))
            continue
        else:
            signal.alarm(0)
--- a/bin/SQLInjectionDetection.py
+++ b/bin/SQLInjectionDetection.py
@ -78,7 +78,7 @@ def analyse(url, path):
        if (result_path > 1) or (result_query > 1):
            print("Detected SQL in URL: ")
            print(urllib.request.unquote(url))
-            to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_path)
+            to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_rel_path)
            publisher.warning(to_print)
            #Send to duplicate
            p.populate_set_out(path, 'Duplicate')
@ -95,7 +95,7 @@ def analyse(url, path):
        else:
            print("Potential SQL injection:")
            print(urllib.request.unquote(url))
-            to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Potential SQL injection", paste.p_path)
+            to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Potential SQL injection", paste.p_rel_path)
            publisher.info(to_print)


--- a/bin/SentimentAnalysis.py
+++ b/bin/SentimentAnalysis.py
@ -45,6 +45,7 @@ cfg = configparser.ConfigParser()
 cfg.read(configfile)

 sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file")
+#time_clean_sentiment_db = 60*60

 def Analyse(message, server):
    path = message
@ -157,9 +158,16 @@ if __name__ == '__main__':
        db=p.config.get("ARDB_Sentiment", "db"),
        decode_responses=True)

+    time1 = time.time()
+
    while True:
        message = p.get_from_set()
        if message is None:
+            #if int(time.time() - time1) > time_clean_sentiment_db:
+            #    clean_db()
+            #    time1 = time.time()
+            #    continue
+            #else:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue
--- a/bin/Tokenize.py
+++ b/bin/Tokenize.py
@ -57,11 +57,11 @@ if __name__ == "__main__":
            try:
                for word, score in paste._get_top_words().items():
                    if len(word) >= 4:
-                        msg = '{} {} {}'.format(paste.p_path, word, score)
+                        msg = '{} {} {}'.format(paste.p_rel_path, word, score)
                        p.populate_set_out(msg)
            except TimeoutException:
                p.incr_module_timeout_statistic()
-                print ("{0} processing timeout".format(paste.p_path))
+                print ("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)
--- a/bin/Web.py
+++ b/bin/Web.py
@ -153,7 +153,7 @@ if __name__ == "__main__":

                    pprint.pprint(A_values)
                    publisher.info('Url;{};{};{};Checked {} URL;{}'.format(
-                        PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_path))
+                        PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path))
            prec_filename = filename

        else:
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@ -158,11 +158,7 @@ class HiddenServices(object):
        if father is None:
            return []
        l_crawled_pastes = []
-        paste_parent = father.replace(self.paste_directory+'/', '')
-        paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent))
-        ## TODO: # FIXME: remove me
-        paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
-        paste_childrens = paste_childrens | paste_children
+        paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
        for children in paste_childrens:
            if self.domain in children:
                l_crawled_pastes.append(children)
@ -198,14 +194,9 @@ class HiddenServices(object):

        set_domain = set()
        for paste in l_paste:
-            paste_full = paste.replace(self.paste_directory+'/', '')
-            paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_full))
-            ## TODO: # FIXME: remove me
-            paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(paste))
-            paste_childrens = paste_childrens | paste_children
+            paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste))
            for children in paste_childrens:
                if not self.domain in children:
-                    print(children)
                    set_domain.add((children.split('.onion')[0]+'.onion').split('/')[-1])

        return set_domain
@ -215,11 +206,7 @@ class HiddenServices(object):
        if father is None:
            return []
        l_crawled_pastes = []
-        paste_parent = father.replace(self.paste_directory+'/', '')
-        paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent))
-        ## TODO: # FIXME: remove me
-        paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
-        paste_childrens = paste_childrens | paste_children
+        paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
        for children in paste_childrens:
            if not self.domain in children:
                l_crawled_pastes.append(children)
--- a/bin/packages/Paste.py
+++ b/bin/packages/Paste.py
@ -82,14 +82,14 @@ class Paste(object):
            db=cfg.getint("ARDB_Metadata", "db"),
            decode_responses=True)

-        PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
-        if PASTES_FOLDER not in p_path:
+        self.PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
+        if self.PASTES_FOLDER not in p_path:
            self.p_rel_path = p_path
-            p_path = os.path.join(PASTES_FOLDER, p_path)
+            self.p_path = os.path.join(self.PASTES_FOLDER, p_path)
        else:
-            self.p_rel_path = None
+            self.p_path = p_path
+            self.p_rel_path = p_path.replace(self.PASTES_FOLDER+'/', '', 1)

-        self.p_path = p_path
        self.p_name = os.path.basename(self.p_path)
        self.p_size = round(os.path.getsize(self.p_path)/1024.0, 2)
        self.p_mime = magic.from_buffer("test", mime=True)
@ -101,7 +101,7 @@ class Paste(object):

        var = self.p_path.split('/')
        self.p_date = Date(var[-4], var[-3], var[-2])
-        self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name)
+        self.p_date_path = os.path.join(var[-4], var[-3], var[-2], self.p_name)
        self.p_source = var[-5]
        self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0])

@ -296,9 +296,13 @@ class Paste(object):
            return False, var

    def _get_p_duplicate(self):
-        self.p_duplicate = self.store_metadata.smembers('dup:'+self.p_path)
-        if self.p_rel_path is not None:
-            self.p_duplicate.union( self.store_metadata.smembers('dup:'+self.p_rel_path) )
+        p_duplicate = self.store_metadata.smembers('dup:'+self.p_path)
+        # remove absolute path #fix-db
+        if p_duplicate:
+            for duplicate_string in p_duplicate:
+                self.store_metadata.srem('dup:'+self.p_path, duplicate_string)
+                self.store_metadata.sadd('dup:'+self.p_rel_path, duplicate_string.replace(self.PASTES_FOLDER+'/', '', 1))
+        self.p_duplicate = self.store_metadata.smembers('dup:'+self.p_rel_path)
        if self.p_duplicate is not None:
            return list(self.p_duplicate)
        else:
@ -318,6 +322,9 @@ class Paste(object):
    def get_p_rel_path(self):
        return self.p_rel_path

+    def get_p_date_path(self):
+        return self.p_date_path
+
    def save_all_attributes_redis(self, key=None):
        """
        Saving all the attributes in a "Redis-like" Database (Redis, LevelDB)
--- a/var/www/modules/Flask_config.py
+++ b/var/www/modules/Flask_config.py
@ -162,8 +162,7 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']

 UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')

-PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
-PASTES_FOLDERS = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
+PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
 SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))

 max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs"))
--- a/var/www/modules/hashDecoded/Flask_hashDecoded.py
+++ b/var/www/modules/hashDecoded/Flask_hashDecoded.py
@ -25,7 +25,7 @@ baseUrl = Flask_config.baseUrl
 r_serv_metadata = Flask_config.r_serv_metadata
 vt_enabled = Flask_config.vt_enabled
 vt_auth = Flask_config.vt_auth
-PASTES_FOLDER = Flask_config.PASTES_FOLDERS
+PASTES_FOLDER = Flask_config.PASTES_FOLDER

 hashDecoded = Blueprint('hashDecoded', __name__, template_folder='templates')

--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@ -26,7 +26,6 @@ r_cache = Flask_config.r_cache
 r_serv_onion = Flask_config.r_serv_onion
 r_serv_metadata = Flask_config.r_serv_metadata
 bootstrap_label = Flask_config.bootstrap_label
-PASTES_FOLDER = Flask_config.PASTES_FOLDER

 hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates')

@ -579,16 +578,14 @@ def show_domain():
    origin_paste_name = h.get_origin_paste_name()
    origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste)))
    paste_tags = []
-    path_name = []
    for path in l_pastes:
-        path_name.append(path.replace(PASTES_FOLDER+'/', ''))
        p_tags = r_serv_metadata.smembers('tag:'+path)
        paste_tags.append(unpack_paste_tags(p_tags))

    return render_template("showDomain.html", domain=domain, last_check=last_check, first_seen=first_seen,
                            l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label,
                            dict_links=dict_links,
-                            path_name=path_name, origin_paste_tags=origin_paste_tags, status=status,
+                            origin_paste_tags=origin_paste_tags, status=status,
                            origin_paste=origin_paste, origin_paste_name=origin_paste_name,
                            domain_tags=domain_tags, screenshot=screenshot)

@ -599,7 +596,6 @@ def onion_son():
    h = HiddenServices(onion_domain, 'onion')
    l_pastes = h.get_last_crawled_pastes()
    l_son = h.get_domain_son(l_pastes)
-    print(l_son)
    return 'l_son'

 # ============= JSON ==============
--- a/var/www/modules/search/Flask_search.py
+++ b/var/www/modules/search/Flask_search.py
@ -29,7 +29,7 @@ r_serv_metadata = Flask_config.r_serv_metadata
 max_preview_char = Flask_config.max_preview_char
 max_preview_modal = Flask_config.max_preview_modal
 bootstrap_label = Flask_config.bootstrap_label
-
+PASTES_FOLDER = Flask_config.PASTES_FOLDER

 baseindexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path"))
 indexRegister_path = os.path.join(os.environ['AIL_HOME'],
@ -133,8 +133,8 @@ def search():
        query = QueryParser("content", ix.schema).parse("".join(q))
        results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
        for x in results:
-            r.append(x.items()[0][1])
-            path = x.items()[0][1]
+            r.append(x.items()[0][1].replace(PASTES_FOLDER, '', 1))
+            path = x.items()[0][1].replace(PASTES_FOLDER, '', 1)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
            content_range = max_preview_char if len(content)>max_preview_char else len(content)-1
@ -208,6 +208,7 @@ def get_more_search_result():
        results = searcher.search_page(query, page_offset, num_elem_to_get)
        for x in results:
            path = x.items()[0][1]
+            path = path.replace(PASTES_FOLDER, '', 1)
            path_array.append(path)
            paste = Paste.Paste(path)
            content = paste.get_p_content()
--- a/var/www/modules/showpaste/Flask_showpaste.py
+++ b/var/www/modules/showpaste/Flask_showpaste.py
@ -41,14 +41,15 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa
 # ============ FUNCTIONS ============

 def showpaste(content_range, requested_path):
-    relative_path = None
    if PASTES_FOLDER not in requested_path:
-        relative_path = requested_path
-        requested_path = os.path.join(PASTES_FOLDER, requested_path)
-    # remove old full path
-    #requested_path = requested_path.replace(PASTES_FOLDER, '')
+        # relative path given: build the absolute path used by the traversal check
+        requested_path_full = os.path.join(PASTES_FOLDER, requested_path)
+    else:
+        requested_path_full = requested_path
+        requested_path = requested_path.replace(PASTES_FOLDER, '', 1)
+
    # escape directory transversal
-    if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER:
+    if os.path.commonprefix((os.path.realpath(requested_path_full),PASTES_FOLDER)) != PASTES_FOLDER:
        return 'path transversal detected'

    vt_enabled = Flask_config.vt_enabled
@ -124,8 +125,6 @@ def showpaste(content_range, requested_path):
    active_taxonomies = r_serv_tags.smembers('active_taxonomies')

    l_tags = r_serv_metadata.smembers('tag:'+requested_path)
-    if relative_path is not None:
-        l_tags.union( r_serv_metadata.smembers('tag:'+relative_path) )

    #active galaxies
    active_galaxies = r_serv_tags.smembers('active_galaxies')
@ -190,7 +189,7 @@ def showpaste(content_range, requested_path):
        crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain')
        crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
        crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
-        crawler_metadata['screenshot'] = paste.get_p_rel_path()
+        crawler_metadata['screenshot'] = paste.get_p_date_path()
    else:
        crawler_metadata['get_metadata'] = False

@ -406,6 +405,7 @@ def send_file_to_vt():
    paste = request.form['paste']
    hash = request.form['hash']

+    ## TODO:  # FIXME:  path transversal
    b64_full_path = os.path.join(os.environ['AIL_HOME'], b64_path)
    b64_content = ''
    with open(b64_full_path, 'rb') as f: