#!/usr/bin/env python3
# -*-coding:UTF-8 -*

"""
The Credential Module
=====================

This module is consuming the Redis-list created by the Categ module.

It applies credential regexes on paste content and warns if the number of
matches is above a threshold.

It also splits the username and stores it into Redis for searching purposes.

Redis organization:
    uniqNumForUsername: unique number attached to a unique username
    uniqNumForPath: unique number attached to a unique path
        -> uniqNums are used to avoid string duplication
    AllCredentials: hashed set where keys are usernames and values are their uniq number
    AllCredentialsRev: the opposite of AllCredentials, uniqNum -> username
    AllPath: hashed set where keys are paths and values are their uniq number
    AllPathRev: the opposite of AllPath, uniqNum -> path
    CredToPathMapping_uniqNumForUsername -> (set) -> uniqNumForPath
"""
# Standard library
import datetime
import re
import sys
import time

# Third party
import redis
from pubsublogger import publisher
from pyfaup.faup import Faup

# Project
from Helper import Process
from packages import Paste
# Split a username on special characters or case changes: runs of lower-case,
# runs of 3+ upper-case, capitalized words (1-2 capitals + lower-case tail),
# and digit runs.
REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"

# Redis counter keys used to allocate unique numbers (see module docstring).
REDIS_KEY_NUM_USERNAME = 'uniqNumForUsername'
# BUGFIX: was 'uniqNumForUsername' (copy-paste error), which made the path
# counter share the username counter instead of using its own sequence, and
# contradicted the documented 'uniqNumForPath' key.
REDIS_KEY_NUM_PATH = 'uniqNumForPath'

# Hashes mapping username <-> uniq number and path <-> uniq number.
REDIS_KEY_ALL_CRED_SET = 'AllCredentials'
REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev'
REDIS_KEY_ALL_PATH_SET = 'AllPath'
REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev'

# Prefix of the per-credential set holding the uniq numbers of the paths
# the credential was seen in ('CredToPathMapping_<uniqNumForUsername>').
REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping'
if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    # Minimum length for an indexed username fragment (shorter parts are noise).
    minimumLengthThreshold = p.config.getint("Credential", "minimumLengthThreshold")

    faup = Faup()

    # Credential-search store (AllCredentials / AllPath hashes, counters, ...).
    server_cred = redis.StrictRedis(
        host=p.config.get("ARDB_TermCred", "host"),
        # CONSISTENCY FIX: use getint like the Statistics connection below
        # (config.get returns a str).
        port=p.config.getint("ARDB_TermCred", "port"),
        db=p.config.getint("ARDB_TermCred", "db"),
        decode_responses=True)

    # Per-TLD statistics store.
    server_statistics = redis.StrictRedis(
        host=p.config.get("ARDB_Statistics", "host"),
        port=p.config.getint("ARDB_Statistics", "port"),
        db=p.config.getint("ARDB_Statistics", "db"),
        decode_responses=True)

    criticalNumberToAlert = p.config.getint("Credential", "criticalNumberToAlert")
    # NOTE(review): minTopPassList is read but not used in this block —
    # presumably consumed elsewhere; kept for config parity.
    minTopPassList = p.config.getint("Credential", "minTopPassList")

    # http(s)://host.tld — used to guess which websites the creds belong to.
    regex_web = "((?:https?:\/\/)[-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
    # email-like credential followed by a separator and a password-ish token.
    # NOTE(review): '[\\rn :\_\-]' matches CR, a literal 'n', space, ':', '_',
    # '-' — the literal 'n' looks like it was meant to be '\n'; left as-is to
    # preserve matching behavior.
    #regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+"
    regex_cred = "[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+"
    # '@domain.tld:' — domain part of a credential, for site statistics.
    regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("Script Credential is Idling 10s")
            #print('sleeping 10s')
            time.sleep(10)
            continue

        filepath, count = message.split(' ')

        paste = Paste.Paste(filepath)
        content = paste.get_p_content()
        creds = set(re.findall(regex_cred, content))

        if len(creds) == 0:
            continue

        # PERF: run the regex once; the original ran the same findall twice.
        sites = re.findall(regex_web, content)  # Use to count occurences
        sites_set = set(sites)

        message = 'Checked {} credentials found.'.format(len(creds))
        if sites_set:
            message += ' Related websites: {}'.format( (', '.join(sites_set)) )

        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_path)

        print('\n '.join(creds))

        # num of creds above thresh, publish an alert
        if len(creds) > criticalNumberToAlert:
            # BUGFIX: the message hardcoded "10" although the threshold is the
            # configurable criticalNumberToAlert.
            print("========> Found more than {} credentials in this file : {}".format( criticalNumberToAlert, filepath ))
            publisher.warning(to_print)
            # Send to duplicate
            p.populate_set_out(filepath, 'Duplicate')
            # Send to alertHandler
            msg = 'credential;{}'.format(filepath)
            p.populate_set_out(msg, 'alertHandler')

            msg = 'infoleak:automatic-detection="credential";{}'.format(filepath)
            p.populate_set_out(msg, 'Tags')

            # Put in form, count occurences, then send to moduleStats
            creds_sites = {}

            # Count '@domain.tld:' occurrences; site[1:-1] strips the leading
            # '@' and trailing ':'.
            site_occurence = re.findall(regex_site_for_stats, content)
            for site in site_occurence:
                site_domain = site[1:-1]
                creds_sites[site_domain] = creds_sites.get(site_domain, 0) + 1

            # Also count domains of every URL found in the paste.
            for url in sites:
                faup.decode(url)
                domain = faup.get()['domain']
                creds_sites[domain] = creds_sites.get(domain, 0) + 1

            # Send for each different site to moduleStats
            for site, num in creds_sites.items():
                mssg = 'credential;{};{};{}'.format(num, site, paste.p_date)
                print(mssg)
                p.populate_set_out(mssg, 'ModuleStats')

            if sites_set:
                print("=======> Probably on : {}".format(', '.join(sites_set)))

            # Per-TLD statistics for the current month.
            date = datetime.datetime.now().strftime("%Y%m")
            for cred in creds:
                # regex_cred guarantees an '@domain.tld' part, so [0] is safe.
                maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
                faup.decode(maildomains)
                tld = faup.get()['tld']
                server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
        else:
            publisher.info(to_print)
            print('found {} credentials'.format(len(creds)))

        # for searching credential in termFreq
        for cred in creds:
            cred = cred.split('@')[0]  # Split to ignore mail address

            # unique number attached to unique path
            uniq_num_path = server_cred.incr(REDIS_KEY_NUM_PATH)
            # NOTE(review): hmset is deprecated in newer redis-py in favor of
            # hset(mapping=...); kept for compatibility with the pinned client.
            server_cred.hmset(REDIS_KEY_ALL_PATH_SET, {filepath: uniq_num_path})
            server_cred.hmset(REDIS_KEY_ALL_PATH_SET_REV, {uniq_num_path: filepath})

            # unique number attached to unique username
            uniq_num_cred = server_cred.hget(REDIS_KEY_ALL_CRED_SET, cred)
            if uniq_num_cred is None:
                # cred does not exist, create new entries
                uniq_num_cred = server_cred.incr(REDIS_KEY_NUM_USERNAME)
                server_cred.hmset(REDIS_KEY_ALL_CRED_SET, {cred: uniq_num_cred})
                server_cred.hmset(REDIS_KEY_ALL_CRED_SET_REV, {uniq_num_cred: cred})

            # Add the mapping between the credential and the path
            server_cred.sadd(REDIS_KEY_MAP_CRED_TO_PATH+'_'+str(uniq_num_cred), uniq_num_path)

            # Split credentials on capital letters, numbers, dots and so on.
            # Add the split to redis; each split part points towards its
            # initial credential's unique number.
            splitedCred = re.findall(REGEX_CRED, cred)
            for partCred in splitedCred:
                if len(partCred) > minimumLengthThreshold:
                    server_cred.sadd(partCred, uniq_num_cred)