AIL-framework/bin/Web.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*

"""
The Web Module
============================

This module tries to parse URLs and warns if any of the configured country codes are present.

"""

import redis
import pprint
import time
import os
import dns.exception
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
from pyfaup.faup import Faup
import re

# Country and ASN lookup
from cymru.ip2asn.dns import DNSClient as ip2asn
import socket
import pycountry
import ipaddress

from Helper import Process

# Used to avoid concatenating None when URL parsing leaves a field empty
def avoidNone(a_string):
    """Return ``a_string`` unchanged, or an empty string when it is None."""
    return "" if a_string is None else a_string

def load_uri_schemes(path):
    """Build the scheme alternation of the URL regex from the protocols file.

    Reads one URI scheme per line from ``path`` and joins them with ``|``.
    Surrounding whitespace is stripped and blank lines are skipped, so a
    missing trailing newline does not truncate the last scheme and an empty
    line does not add an empty alternative to the regex.
    """
    with open(path, 'r') as scheme_file:
        schemes = (line.strip() for line in scheme_file)
        return "|".join(scheme for scheme in schemes if scheme)


def build_url_regex(uri_scheme):
    """Return the URL pattern for the given ``|``-separated scheme list.

    The pattern matches: scheme, optional credentials, a host (dotted IPv4
    address, ``localhost`` or a domain name ending in one of the listed TLDs
    or any two-letter TLD), an optional port and optional path segments.
    """
    # Raw string: the regex escapes must reach the re module untouched.
    return "(" + uri_scheme + r")\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?'\\+&%\$#\=~_\-]+))*"


def to_text(value):
    """Decode ``value`` when it is bytes; return anything else unchanged."""
    if isinstance(value, bytes):
        return value.decode(errors="replace")
    return value


def build_hostname(subdomain, domain):
    """Join the non-empty parts among subdomain and domain with a dot."""
    # NOTE(review): assumes faup reports the subdomain without a trailing
    # dot (e.g. "www" + "example.com") -- confirm against the faup output.
    return ".".join(part for part in (subdomain, domain) if part)


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Web'

    p = Process(config_section)

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)

    # Protocol file path
    protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                         p.config.get("Directories", "protocolsfile"))

    # Country to log as critical
    cc_critical = p.config.get("Url", "cc_critical")

    # FUNCTIONS #
    publisher.info("Script URL subscribed to channel web_categ")

    # FIXME For retro compatibility
    channel = 'web_categ'

    message = p.get_from_set()
    prec_filename = None
    faup = Faup()

    # Get all uri from protocolsfile (Used for Curve) and compile the URL
    # pattern once, outside the processing loop.
    url_regex = build_url_regex(load_uri_schemes(protocolsfile_path))
    url_pattern = re.compile(url_regex)

    while True:
        if message is not None:
            # The message must hold exactly two whitespace-separated fields;
            # only the first one (the paste filename) is used here.
            filename, _score = message.split()

            if prec_filename is None or filename != prec_filename:
                domains_list = []
                PST = Paste.Paste(filename)
                client = ip2asn()
                # Walk every match in the paste. Searching the whole content
                # again on each iteration would return the first URL each time.
                for matching_url in url_pattern.finditer(PST.get_p_content()):
                    url = matching_url.group(0)

                    to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
                    p.populate_set_out(to_send, 'Url')

                    faup.decode(url)
                    # faup may hand back bytes or str depending on its version.
                    domain = to_text(faup.get_domain())
                    subdomain = to_text(faup.get_subdomain())

                    publisher.debug('{} Published'.format(url))

                    if domain is not None:
                        domains_list.append(domain)

                    hostl = build_hostname(subdomain, domain)

                    # Process-wide default timeout for newly created sockets.
                    socket.setdefaulttimeout(1)
                    try:
                        ip = socket.gethostbyname(hostl)
                    except (OSError, ValueError):
                        # If the resolver is not giving any IPv4 address
                        # (or the host name cannot be encoded), the ASN/CC
                        # lookup is skipped.
                        continue

                    try:
                        lookup_result = client.lookup(ip, qType='IP')
                    except (ipaddress.AddressValueError,
                            dns.exception.DNSException):
                        # NOTE(review): DNSException is caught on the
                        # assumption that the lookup client can raise
                        # dnspython errors (e.g. timeouts) -- confirm.
                        continue
                    cc = lookup_result.cc
                    asn = ''
                    if lookup_result.asn is not None:
                        # Drop the first two characters; presumably a leading
                        # "b'" left by a bytes repr -- TODO confirm.
                        asn = lookup_result.asn[2:]

                    # EU is not an official ISO 3166 code (but used by RIPE
                    # IP allocation)
                    if cc is not None and cc != "EU":
                        # An unknown code must not crash the module: depending
                        # on the pycountry version the lookup either returns
                        # None or raises a LookupError subclass.
                        try:
                            country = pycountry.countries.get(alpha_2=cc)
                        except LookupError:
                            country = None
                        country_name = country.name if country is not None else ''
                        print(hostl, asn, cc, country_name)
                        if cc == cc_critical:
                            to_print = 'Url;{};{};{};Detected {} {}'.format(
                                    PST.p_source, PST.p_date, PST.p_name,
                                    hostl, cc)
                            #publisher.warning(to_print)
                            print(to_print)
                    else:
                        print(hostl, asn, cc)

                A_values = lib_refine.checking_A_record(r_serv2,
                                                        domains_list)

                if A_values[0] >= 1:
                    setattr(PST, channel, A_values)
                    PST.save_attribute_redis(channel, (A_values[0],
                                             list(A_values[1])))

                    pprint.pprint(A_values)
                    publisher.info('Url;{};{};{};Checked {} URL;{}'.format(
                        PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path))
            prec_filename = filename

        else:
            publisher.debug("Script url is Idling 10s")
            print('Sleeping')
            time.sleep(10)

        message = p.get_from_set()
decode with redis connection 2018-05-04 13:53:29 +02:00			`#!/usr/bin/env python3`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`# --coding:UTF-8 -`
Improved description of modules inside the scripts 2017-05-09 11:13:16 +02:00
			`"""`
			`The Web Module`
			`============================`

			`This module tries to parse URLs and warns if some defined contry code are present.`

			`"""`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`import redis`
			`import pprint`
			`import time`
Bug fix related with redis: Fixed typo key in redis for module creditcard and sqlinjection Modified Curve redisLvlDb server Modified Url.py so that it forwards name of protocol from saved protocolsfile Added Cache control in Flask Modified key-tab name into keys-tab 2016-08-09 11:59:36 +02:00			`import os`
Importing dns.exeption fix #4 fix #7 2014-08-11 09:27:50 +02:00			`import dns.exception`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`from packages import Paste`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`from packages import lib_refine`
			`from pubsublogger import publisher`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`from pyfaup.faup import Faup`
			`import re`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Log where URLs are hosted - cc_critical option added It logs where the hostname of the URL is hosted (ASN and geographic location). A simple option cc_critical added to set the country code to log as critical. 2014-08-14 14:22:11 +02:00			`# Country and ASN lookup`
			`from cymru.ip2asn.dns import DNSClient as ip2asn`
			`import socket`
			`import pycountry`
			`import ipaddress`

Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`from Helper import Process`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Added few comments 2016-07-04 09:18:23 +02:00			`# Used to prevent concat with empty fields due to url parsing`
modified variable name str 2016-07-12 11:52:19 +02:00			`def avoidNone(a_string):`
			`if a_string is None:`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`return ""`
			`else:`
modified variable name str 2016-07-12 11:52:19 +02:00			`return a_string`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`if __name__ == "__main__":`
Small fixes to make the refactoring production ready * the port for the logging is 6380 * use os.environ properly * fix typos 2014-08-22 17:35:40 +02:00			`publisher.port = 6380`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`publisher.channel = "Script"`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`config_section = 'Web'`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`p = Process(config_section)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`# REDIS #`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`r_serv2 = redis.StrictRedis(`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`host=p.config.get("Redis_Cache", "host"),`
			`port=p.config.getint("Redis_Cache", "port"),`
decode with redis connection 2018-05-04 13:53:29 +02:00			`db=p.config.getint("Redis_Cache", "db"),`
			`decode_responses=True)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Bug fix related with redis: Fixed typo key in redis for module creditcard and sqlinjection Modified Curve redisLvlDb server Modified Url.py so that it forwards name of protocol from saved protocolsfile Added Cache control in Flask Modified key-tab name into keys-tab 2016-08-09 11:59:36 +02:00			`# Protocol file path`
			`protocolsfile_path = os.path.join(os.environ['AIL_HOME'],`
			`p.config.get("Directories", "protocolsfile"))`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Country to log as critical`
More cleanup 2014-09-05 10:42:01 +02:00			`cc_critical = p.config.get("Url", "cc_critical")`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`# FUNCTIONS #`
			`publisher.info("Script URL subscribed to channel web_categ")`

Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`# FIXME For retro compatibility`
			`channel = 'web_categ'`

			`message = p.get_from_set()`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`prec_filename = None`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`faup = Faup()`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Bug fix related with redis: Fixed typo key in redis for module creditcard and sqlinjection Modified Curve redisLvlDb server Modified Url.py so that it forwards name of protocol from saved protocolsfile Added Cache control in Flask Modified key-tab name into keys-tab 2016-08-09 11:59:36 +02:00			`# Get all uri from protocolsfile (Used for Curve)`
			`uri_scheme = ""`
			`with open(protocolsfile_path, 'r') as scheme_file:`
			`for scheme in scheme_file:`
			`uri_scheme += scheme[:-1]+"\|"`
			`uri_scheme = uri_scheme[:-1]`

			url_regex = "("+uri_scheme+")\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)@)((25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9])\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[0-9])\|localhost\|([a-zA-Z0-9\-]+\.)[a-zA-Z0-9\-]+\.(com\|edu\|gov\|int\|mil\|net\|org\|biz\|arpa\|info\|name\|pro\|aero\|coop\|museum\|[a-zA-Z]{2}))(\:[0-9]+)(/($\|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*"
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`while True:`
Fix the exceptions 2014-09-04 11:46:07 +02:00			`if message is not None:`
Categ now listen to the Global queue 2014-09-05 17:05:45 +02:00			`filename, score = message.split()`
Fix the exceptions 2014-09-04 11:46:07 +02:00
			`if prec_filename is None or filename != prec_filename:`
			`domains_list = []`
			`PST = Paste.Paste(filename)`
			`client = ip2asn()`
			`for x in PST.get_regex(url_regex):`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`matching_url = re.search(url_regex, PST.get_p_content())`
			`url = matching_url.group(0)`

Added SQLInjectionDetection module 2016-08-02 15:43:11 +02:00			`to_send = "{} {} {}".format(url, PST._get_p_date(), filename)`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`p.populate_set_out(to_send, 'Url')`

			`faup.decode(url)`
			`domain = faup.get_domain()`
			`subdomain = faup.get_subdomain()`

			`publisher.debug('{} Published'.format(url))`
Fix the exceptions 2014-09-04 11:46:07 +02:00
fix python3 types issue 2018-04-16 17:00:44 +02:00			`if subdomain is not None:`
fix: [faup] fix new return types (bytes to str) 2019-05-06 13:38:13 +02:00			`## TODO: # FIXME: remove me`
			`try:`
			`subdomain = subdomain.decode()`
			`except:`
			`pass`
fix python3 types issue 2018-04-16 17:00:44 +02:00
			`if domain is not None:`
fix: [faup] fix new return types (bytes to str) 2019-05-06 13:38:13 +02:00			`## TODO: # FIXME: remove me`
			`try:`
			`domain = domain.decode()`
			`except:`
			`pass`
fix python3 types issue 2018-04-16 17:00:44 +02:00			`domains_list.append(domain)`

			`hostl = avoidNone(subdomain) + avoidNone(domain)`

Fix the exceptions 2014-09-04 11:46:07 +02:00			`try:`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`socket.setdefaulttimeout(1)`
fix python3 types issue 2018-04-16 17:00:44 +02:00			`ip = socket.gethostbyname(hostl)`
Fix the exceptions 2014-09-04 11:46:07 +02:00			`except:`
			`# If the resolver is not giving any IPv4 address,`
			`# ASN/CC lookup is skip.`
			`continue`

			`try:`
			`l = client.lookup(ip, qType='IP')`
fix python3 types issue 2018-04-16 17:00:44 +02:00
Fix the exceptions 2014-09-04 11:46:07 +02:00			`except ipaddress.AddressValueError:`
			`continue`
			`cc = getattr(l, 'cc')`
add: Decoder Module, decode binary, hex and base64 2018-07-19 16:50:42 +02:00			`asn = ''`
fix python3 types issue 2018-04-16 17:00:44 +02:00			`if getattr(l, 'asn') is not None:`
			`asn = getattr(l, 'asn')[2:] #remobe b'`
Fix the exceptions 2014-09-04 11:46:07 +02:00
			`# EU is not an official ISO 3166 code (but used by RIPE`
			`# IP allocation)`
			`if cc is not None and cc != "EU":`
python 3 backend upgrade 2018-04-16 14:50:04 +02:00			`print(hostl, asn, cc, \`
			`pycountry.countries.get(alpha_2=cc).name)`
Fix the exceptions 2014-09-04 11:46:07 +02:00			`if cc == cc_critical:`
Added warning_paste module and created related webpages. Fixed a Faup bug in credential (multiple instanciation) and added correc populate_set_out in concerned modules (creditcard, credential, ...). Linked browse_warning_paste module and Flask function with redis (created new sets). 2016-08-08 09:17:44 +02:00			`to_print = 'Url;{};{};{};Detected {} {}'.format(`
Fix the exceptions 2014-09-04 11:46:07 +02:00			`PST.p_source, PST.p_date, PST.p_name,`
Added warning_paste module and created related webpages. Fixed a Faup bug in credential (multiple instanciation) and added correc populate_set_out in concerned modules (creditcard, credential, ...). Linked browse_warning_paste module and Flask function with redis (created new sets). 2016-08-08 09:17:44 +02:00			`hostl, cc)`
			`#publisher.warning(to_print)`
python 3 backend upgrade 2018-04-16 14:50:04 +02:00			`print(to_print)`
Fix the exceptions 2014-09-04 11:46:07 +02:00			`else:`
python 3 backend upgrade 2018-04-16 14:50:04 +02:00			`print(hostl, asn, cc)`
Fix the exceptions 2014-09-04 11:46:07 +02:00
			`A_values = lib_refine.checking_A_record(r_serv2,`
			`domains_list)`
fix python3 types issue 2018-04-16 17:00:44 +02:00
Fix the exceptions 2014-09-04 11:46:07 +02:00			`if A_values[0] >= 1:`
			`PST.__setattr__(channel, A_values)`
			`PST.save_attribute_redis(channel, (A_values[0],`
			`list(A_values[1])))`

fix python3 types issue 2018-04-16 17:00:44 +02:00
Fix the exceptions 2014-09-04 11:46:07 +02:00			`pprint.pprint(A_values)`
Added support of browsing concerned paste in dashboard for all modules + Show in the page what was the concerned module 2016-10-27 11:50:24 +02:00			`publisher.info('Url;{};{};{};Checked {} URL;{}'.format(`
fix: [Scripts] Remove absolute path 2018-11-02 16:07:27 +01:00			`PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path))`
Fix the exceptions 2014-09-04 11:46:07 +02:00			`prec_filename = filename`

			`else:`
			`publisher.debug("Script url is Idling 10s")`
python 3 backend upgrade 2018-04-16 14:50:04 +02:00			`print('Sleeping')`
Fix the exceptions 2014-09-04 11:46:07 +02:00			`time.sleep(10)`

			`message = p.get_from_set()`