AIL-framework/bin/Onion.py

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
The ZMQ_Sub_Onion Module
============================

This module is consuming the Redis-list created by the ZMQ_Sub_Onion_Q Module.

It tries to extract URLs from pastes and returns only the ones which are Tor
related (.onion)

    ..seealso:: Paste method (get_regex)

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (Redis)
*Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.

"""
import pprint
import time
from packages import Paste
from pubsublogger import publisher
import datetime
import os
import base64
import subprocess
import redis

from Helper import Process


def fetch(p, r_cache, urls, domains, path):
    """Fetch every not-yet-cached URL via ``tor_fetcher.py``; yield the successes.

    For each ``(url, domain)`` pair a ``python ./tor_fetcher.py <b64 url>``
    child process is started.  On exit code 0 its stdout is used as the path
    of a temporary file whose content is base64-decoded and written below
    ``$AIL_HOME/<Directories.pastes>/``; the saved path, the URL and the
    still-encoded payload are then pushed to the 'Global', 'ValidOnion' and
    'FetchedOnion' output queues and the URL is yielded.

    :param p: object exposing ``config.get(section, option)`` and
        ``populate_set_out(message, queue_name)``.
    :param r_cache: Redis-like client (``exists``, ``setbit``, ``expire``)
        used to remember which URLs were already tried.
    :param urls: URLs to download.
    :param domains: host of each URL, in the same order as ``urls``.
    :param path: relative path prefix used to build the saved file name.
    :returns: generator over the URLs that were downloaded successfully.
    """
    failed = []
    downloaded = []
    print('%s Urls to fetch.' % len(urls))
    for url, domain in zip(urls, domains):
        # Skip anything already tried, in this call or recently (Redis key).
        if r_cache.exists(url) or url in failed:
            continue
        to_fetch = base64.standard_b64encode(url)
        process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],
                                   stdout=subprocess.PIPE)
        # communicate() drains stdout while waiting for the child.  Polling
        # with poll()/sleep() on a PIPE can deadlock once the child has
        # written more than the OS pipe buffer holds.
        output = process.communicate()[0]

        if process.returncode != 0:
            # Remember the failure for one hour so it is not retried at once.
            r_cache.setbit(url, 0, 0)
            r_cache.expire(url, 3600)
            failed.append(url)
            print('Failed at downloading %s' % url)
            print(output)
            continue

        # Remember the success for 360000 seconds (100 hours).
        r_cache.setbit(url, 0, 1)
        r_cache.expire(url, 360000)
        downloaded.append(url)

        # Read the temporary file and delete it immediately, so it does not
        # linger if a later step raises or the caller stops iterating.
        tempfile = output.strip()
        with open(tempfile, 'r') as f:
            fetched = f.read()
        os.unlink(tempfile)

        content = base64.standard_b64decode(fetched)
        # NOTE(review): path and domain are joined without a separator, so
        # the file name is '<last component of path><domain>.gz' -- confirm
        # this is intended before changing it.
        filename = path + domain + '.gz'
        save_path = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "pastes"),
                                 filename)
        dirname = os.path.dirname(save_path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        # The decoded payload is binary data, hence 'wb'.
        with open(save_path, 'wb') as ff:
            ff.write(content)

        p.populate_set_out(save_path, 'Global')
        p.populate_set_out(url, 'ValidOnion')
        p.populate_set_out(fetched, 'FetchedOnion')
        yield url
    print('Failed: %s Downloaded: %s' % (len(failed), len(downloaded)))


if __name__ == "__main__":
    # Entry point: consume "<paste filename> <score>" messages from the
    # 'Onion' queue, extract URLs matching url_regex from each paste, record
    # the matched hosts on the paste and try to download every URL found.
    publisher.port = 6380
    publisher.channel = "Script"

    # NOTE(review): these two values are not referenced anywhere else in
    # this file; kept in case something outside this view relies on them.
    torclient_host = '127.0.0.1'
    torclient_port = 9050

    config_section = 'Onion'

    p = Process(config_section)
    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"))

    publisher.info("Script subscribed to channel onion_categ")

    # FIXME For retro compatibility
    channel = 'onion_categ'

    # Thanks to Faup project for this regex
    # https://github.com/stricaud/faup
    # Group 1 is the whole URL, group 5 the host part (IPv4 address,
    # 'localhost' or a .onion name).
    url_regex = "((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

    # Name of the previously handled paste; a message naming the same paste
    # twice in a row is not processed again.
    prec_filename = None

    while True:
        message = p.get_from_set()
        if message is None:
            # Nothing queued: wait before polling again.
            publisher.debug("Script url is Idling 10s")
            print('Sleeping')
            time.sleep(10)
            continue

        print(message)
        # The score is part of the message format but is not used here.
        filename, score = message.split()

        # "For each new paste"
        if filename != prec_filename:
            domains_list = []
            urls = []
            PST = Paste.Paste(filename)

            # Extracting url with regex: each match carries one entry per
            # regex group.
            for x in PST.get_regex(url_regex):
                urls.append(x[0])
                domains_list.append(x[4])

            # Saving the list of extracted onion domains.
            setattr(PST, channel, domains_list)
            PST.save_attribute_redis(channel, domains_list)
            to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
                                                PST.p_name)
            if domains_list:
                publisher.warning('{}Detected {} .onion(s)'.format(
                    to_print, len(domains_list)))
                # Storage prefix: onions/YYYY/MM/DD/<unix timestamp>
                now = datetime.datetime.now()
                path = os.path.join('onions', str(now.year).zfill(4),
                                    str(now.month).zfill(2),
                                    str(now.day).zfill(2),
                                    str(int(time.mktime(now.utctimetuple()))))
                # fetch() is a generator: iterating it performs the
                # downloads and yields each URL that succeeded.
                for url in fetch(p, r_cache, urls, domains_list, path):
                    publisher.warning('{}Checked {}'.format(to_print, url))
            else:
                publisher.info('{}Onion related'.format(to_print))

        prec_filename = filename
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`#!/usr/bin/env python2`
			`# --coding:UTF-8 -`
			`"""`
			`The ZMQ_Sub_Onion Module`
			`============================`

			`This module is consuming the Redis-list created by the ZMQ_Sub_Onion_Q Module.`

completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`It trying to extract url from paste and returning only ones which are tor`
			`related (.onion)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`..seealso:: Paste method (get_regex)`

			`..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put`
			`the same Subscriber name in both of them.`

			`Requirements`
			`------------`

			`*Need running Redis instances. (Redis)`
			`*Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.`

			`"""`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`import pprint`
			`import time`
			`from packages import Paste`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`from pubsublogger import publisher`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`import datetime`
			`import os`
			`import base64`
			`import subprocess`
Fix the onion module, log the valid onions. 2014-09-01 16:18:06 +02:00			`import redis`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`from Helper import Process`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00
Fix the onion module, log the valid onions. 2014-09-01 16:18:06 +02:00			`def fetch(p, r_cache, urls, domains, path):`
Update starting script. 2014-09-02 15:21:36 +02:00			`failed = []`
fix onions, cc and domain classifier modules 2014-09-08 16:51:43 +02:00			`downloaded = []`
			`print len(urls), 'Urls to fetch.'`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`for url, domain in zip(urls, domains):`
Update starting script. 2014-09-02 15:21:36 +02:00			`if r_cache.exists(url) or url in failed:`
Fix the onion module, log the valid onions. 2014-09-01 16:18:06 +02:00			`continue`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`to_fetch = base64.standard_b64encode(url)`
			`process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],`
			`stdout=subprocess.PIPE)`
			`while process.poll() is None:`
			`time.sleep(1)`

			`if process.returncode == 0:`
Fix the onion module, log the valid onions. 2014-09-01 16:18:06 +02:00			`r_cache.setbit(url, 0, 1)`
fix onions, cc and domain classifier modules 2014-09-08 16:51:43 +02:00			`r_cache.expire(url, 360000)`
			`downloaded.append(url)`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`tempfile = process.stdout.read().strip()`
			`with open(tempfile, 'r') as f:`
fix onions, cc and domain classifier modules 2014-09-08 16:51:43 +02:00			`filename = path + domain + '.gz'`
Publish the fetched onions on a ZMQ feed. 2014-09-30 16:55:16 +02:00			`fetched = f.read()`
			`content = base64.standard_b64decode(fetched)`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`save_path = os.path.join(os.environ['AIL_HOME'],`
			`p.config.get("Directories", "pastes"),`
			`filename)`
			`dirname = os.path.dirname(save_path)`
			`if not os.path.exists(dirname):`
			`os.makedirs(dirname)`
			`with open(save_path, 'w') as ff:`
			`ff.write(content)`
Fix the onion module, log the valid onions. 2014-09-01 16:18:06 +02:00			`p.populate_set_out(save_path, 'Global')`
			`p.populate_set_out(url, 'ValidOnion')`
Publish the fetched onions on a ZMQ feed. 2014-09-30 16:55:16 +02:00			`p.populate_set_out(fetched, 'FetchedOnion')`
Fix the onion module, log the valid onions. 2014-09-01 16:18:06 +02:00			`yield url`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`os.unlink(tempfile)`
			`else:`
fix onions, cc and domain classifier modules 2014-09-08 16:51:43 +02:00			`r_cache.setbit(url, 0, 0)`
			`r_cache.expire(url, 3600)`
Update starting script. 2014-09-02 15:21:36 +02:00			`failed.append(url)`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`print 'Failed at downloading', url`
			`print process.stdout.read()`
fix onions, cc and domain classifier modules 2014-09-08 16:51:43 +02:00			`print 'Failed:', len(failed), 'Downloaded:', len(downloaded)`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00

completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`if __name__ == "__main__":`
Small fixes to make the refactoring production ready * the port for the logging is 6380 * use os.environ properly * fix typos 2014-08-22 17:35:40 +02:00			`publisher.port = 6380`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`publisher.channel = "Script"`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`torclient_host = '127.0.0.1'`
			`torclient_port = 9050`

Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`config_section = 'Onion'`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`p = Process(config_section)`
Fix the onion module, log the valid onions. 2014-09-01 16:18:06 +02:00			`r_cache = redis.StrictRedis(`
			`host=p.config.get("Redis_Cache", "host"),`
			`port=p.config.getint("Redis_Cache", "port"),`
			`db=p.config.getint("Redis_Cache", "db"))`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`# FUNCTIONS #`
			`publisher.info("Script subscribed to channel onion_categ")`

Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`# FIXME For retro compatibility`
			`channel = 'onion_categ'`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Getting the first message from redis.`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`message = p.get_from_set()`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`prec_filename = None`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Thanks to Faup project for this regex`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`# https://github.com/stricaud/faup`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`url_regex = "((http\|https\|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)@)((25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9])\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[0-9])\|localhost\|([a-zA-Z0-9\-]+\.)[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)(/($\|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`while True:`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`if message is not None:`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`print message`
Categ now listen to the Global queue 2014-09-05 17:05:45 +02:00			`filename, score = message.split()`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`# "For each new paste"`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`if prec_filename is None or filename != prec_filename:`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`domains_list = []`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`urls = []`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`PST = Paste.Paste(filename)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`for x in PST.get_regex(url_regex):`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Extracting url with regex`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`url, s, credential, subdomain, domain, host, port, \`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`resource_path, query_string, f1, f2, f3, f4 = x`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`domains_list.append(domain)`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`urls.append(url)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Saving the list of extracted onion domains.`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`PST.__setattr__(channel, domains_list)`
move Redis_Data_Merging to Paste 2014-08-21 12:22:07 +02:00			`PST.save_attribute_redis(channel, domains_list)`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,`
			`PST.p_name)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`if len(domains_list) > 0:`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`publisher.warning('{}Detected {} .onion(s)'.format(`
			`to_print, len(domains_list)))`
The onion module now fetches the URLs it finds. 2014-08-31 22:42:12 +02:00			`now = datetime.datetime.now()`
			`path = os.path.join('onions', str(now.year).zfill(4),`
			`str(now.month).zfill(2),`
			`str(now.day).zfill(2),`
			`str(int(time.mktime(now.utctimetuple()))))`
Fix the onion module, log the valid onions. 2014-09-01 16:18:06 +02:00			`to_print = 'Onion;{};{};{};'.format(PST.p_source,`
			`PST.p_date,`
			`PST.p_name)`
			`for url in fetch(p, r_cache, urls, domains_list, path):`
Update starting script. 2014-09-02 15:21:36 +02:00			`publisher.warning('{}Checked {}'.format(to_print, url))`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`else:`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`publisher.info('{}Onion related'.format(to_print))`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`prec_filename = filename`
			`else:`
			`publisher.debug("Script url is Idling 10s")`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`print 'Sleeping'`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`time.sleep(10)`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`message = p.get_from_set()`