#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import base64
import os
import re
import sys

import redis
import cld3
import html2text

from io import BytesIO

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
import Date
import Tag
import Cryptocurrency
import Pgp

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import item_basic
import domain_basic
import ConfigLoader
import Correlate_object
import Decoded
import Screenshot
import Username

from ail_objects import AbstractObject
from item_basic import *

config_loader = ConfigLoader.ConfigLoader()
# get and sanitize PASTE DIRECTORY
# # TODO: rename PASTES_FOLDER
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')

r_cache = config_loader.get_redis_conn("Redis_Cache")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
screenshot_directory = config_loader.get_files_directory('screenshot')
har_directory = config_loader.get_files_directory('har')
config_loader = None

def exist_item(item_id):
    return item_basic.exist_item(item_id)

def get_basename(item_id):
    return os.path.basename(item_id)

def get_item_id(full_path):
    # strip the pastes folder prefix to get the relative item id
    return full_path.replace(PASTES_FOLDER, '', 1)

def get_item_filepath(item_id):
    return item_basic.get_item_filepath(item_id)

def get_item_date(item_id, add_separator=False):
    return item_basic.get_item_date(item_id, add_separator=add_separator)

def get_source(item_id):
    return item_basic.get_source(item_id)

def get_all_sources():
    return item_basic.get_all_items_sources(r_list=True)

def get_item_basename(item_id):
    return os.path.basename(item_id)

def get_item_size(item_id):
    # file size in kilobytes, rounded to two decimals
    return round(os.path.getsize(os.path.join(PASTES_FOLDER, item_id))/1024.0, 2)

def get_item_encoding(item_id):
    return None

def get_lines_info(item_id, item_content=None):
    if not item_content:
        item_content = get_item_content(item_id)
    max_length = 0
    nb_line = 0
    for line in item_content.splitlines():
        length = len(line)
        if length > max_length:
            max_length = length
        nb_line += 1
    return {'nb': nb_line, 'max_length': max_length}
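
# Shape sketch of the dict returned above (the item id and counts are hypothetical):
#     get_lines_info('archive/example_feeder/2021/10/04/example.gz')
#     # {'nb': 42, 'max_length': 80}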

def get_item_metadata(item_id, item_content=None):
    ## TODO: FIXME ##performance
    # encoding
    # language
    # lines info
    item_metadata = {'date': get_item_date(item_id, add_separator=True),
                     'source': get_source(item_id),
                     'size': get_item_size(item_id),
                     'encoding': get_item_encoding(item_id),
                     'lines': get_lines_info(item_id, item_content=item_content)
                    }
    return item_metadata
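
# Shape sketch of the metadata dict built above (item id and values are hypothetical):
#     get_item_metadata('archive/example_feeder/2021/10/04/example.gz')
#     # {'date': '2021/10/04', 'source': 'archive/example_feeder', 'size': 1.52,
#     #  'encoding': None, 'lines': {'nb': 42, 'max_length': 80}}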

def get_item_parent(item_id):
    return item_basic.get_item_parent(item_id)

def add_item_parent(item_parent, item_id):
    return item_basic.add_item_parent(item_parent, item_id)

def get_item_content(item_id):
    return item_basic.get_item_content(item_id)

def get_item_content_html2text(item_id, item_content=None, ignore_links=False):
    if not item_content:
        item_content = get_item_content(item_id)
    h = html2text.HTML2Text()
    h.ignore_links = ignore_links
    h.ignore_images = ignore_links
    return h.handle(item_content)

def remove_all_urls_from_content(item_id, item_content=None):
    if not item_content:
        item_content = get_item_content(item_id)
    regex = r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b'
    url_regex = re.compile(regex)
    urls = url_regex.findall(item_content)
    # remove the longest URLs first so substrings of longer matches are not left behind
    urls = sorted(urls, key=len, reverse=True)
    for url in urls:
        item_content = item_content.replace(url, '')

    regex_pgp_public_blocs = re.compile(r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----')
    regex_pgp_signature = re.compile(r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----')
    regex_pgp_message = re.compile(r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----')

    # strip PGP blocks (keys, signatures, messages) from the content
    for pgp_regex in (regex_pgp_public_blocs, regex_pgp_signature, regex_pgp_message):
        for it in pgp_regex.findall(item_content):
            item_content = item_content.replace(it, '')

    return item_content
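
# Behaviour sketch: when item_content is supplied, the item id is never read,
# so the function can be exercised on a literal string:
#     remove_all_urls_from_content(None, item_content='see https://example.com for details')
#     # 'see  for details' (the URL is removed, surrounding spacing is kept)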

def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
    all_languages = []

    ## CLEAN CONTENT ##
    content = get_item_content_html2text(item_id, ignore_links=True)
    content = remove_all_urls_from_content(item_id, item_content=content)

    # REMOVE USELESS SPACE
    content = ' '.join(content.split())
    #- CLEAN CONTENT -#

    #print(content)
    #print(len(content))
    if len(content) >= min_len:
        for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
            if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
                all_languages.append(lang)
    return all_languages
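
# Usage sketch (hypothetical item id; the thresholds mirror the commented
# example at the bottom of this module):
#     for lang in get_item_languages('archive/example_feeder/2021/10/04/example.gz',
#                                    min_proportion=0.2, min_probability=0.8):
#         print(lang.language, lang.probability)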

# API
def get_item(request_dict):
    if not request_dict:
        return {'status': 'error', 'reason': 'Malformed JSON'}, 400

    item_id = request_dict.get('id', None)
    if not item_id:
        return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
    if not exist_item(item_id):
        return {'status': 'error', 'reason': 'Item not found'}, 404

    dict_item = {}
    dict_item['id'] = item_id
    date = request_dict.get('date', True)
    if date:
        add_separator = False
        if request_dict.get('date_separator', False):
            add_separator = True
        dict_item['date'] = get_item_date(item_id, add_separator=add_separator)
    tags = request_dict.get('tags', True)
    if tags:
        dict_item['tags'] = Tag.get_obj_tag(item_id)

    size = request_dict.get('size', False)
    if size:
        dict_item['size'] = get_item_size(item_id)

    content = request_dict.get('content', False)
    if content:
        # UTF-8 output, # TODO: use base64
        dict_item['content'] = get_item_content(item_id)

    raw_content = request_dict.get('raw_content', False)
    if raw_content:
        dict_item['raw_content'] = get_raw_content(item_id)

    lines_info = request_dict.get('lines', False)
    if lines_info:
        dict_item['lines'] = get_lines_info(item_id, dict_item.get('content', None))

    if request_dict.get('pgp'):
        dict_item['pgp'] = {}
        if request_dict['pgp'].get('key'):
            dict_item['pgp']['key'] = get_item_pgp_key(item_id)
        if request_dict['pgp'].get('mail'):
            dict_item['pgp']['mail'] = get_item_pgp_mail(item_id)
        if request_dict['pgp'].get('name'):
            dict_item['pgp']['name'] = get_item_pgp_name(item_id)

    if request_dict.get('cryptocurrency'):
        dict_item['cryptocurrency'] = {}
        if request_dict['cryptocurrency'].get('bitcoin'):
            dict_item['cryptocurrency']['bitcoin'] = get_item_bitcoin(item_id)

    return dict_item, 200
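
# Request sketch for get_item() (the field names are the request_dict keys
# read above; the item id is hypothetical):
#     payload = {'id': 'archive/example_feeder/2021/10/04/example.gz',
#                'content': True, 'lines': True,
#                'pgp': {'key': True, 'mail': True},
#                'cryptocurrency': {'bitcoin': True}}
#     res, status_code = get_item(payload)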

def api_get_item_content_base64_utf8(request_dict):
    # check the request before dereferencing it: request_dict can be None
    if not request_dict:
        return {'status': 'error', 'reason': 'Malformed JSON'}, 400
    item_id = request_dict.get('id', None)
    if not item_id:
        return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
    if not exist_item(item_id):
        return {'status': 'error', 'reason': 'Item not found'}, 404

    item_content = get_item_content(item_id)
    item_content = base64.b64encode((item_content.encode('utf-8'))).decode('UTF-8')
    return {'status': 'success', 'content': item_content}, 200
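
# Client-side sketch: the 'content' field round-trips through base64
# (hypothetical item id):
#     res, status = api_get_item_content_base64_utf8({'id': 'archive/example_feeder/2021/10/04/example.gz'})
#     if status == 200:
#         text = base64.b64decode(res['content']).decode('utf-8')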

def api_get_items_sources():
    item_content = {'sources': get_all_sources()}
    return item_content, 200

# def check_item_source(request_dict):
#     source = request_dict.get('source', None)
#     if not request_dict:
#         return {'status': 'error', 'reason': 'Malformed JSON'}, 400
#     if not source:
#         return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
#
#     all_sources = item_basic.get_all_items_sources()
#
#     if source not in all_sources:
#         return {'status': 'error', 'reason': 'Invalid source', 'provide': source}, 400
#     return {'status': 'success', 'reason': 'Valid source', 'provide': source}, 200

###
### correlation
###

def get_item_cryptocurrency(item_id, currencies_type=None, get_nb=False):
    '''
    Return all cryptocurrencies of a given item.

    :param item_id: item id
    :param currencies_type: list of cryptocurrencies type
    :type currencies_type: list, optional
    '''
    return Cryptocurrency.cryptocurrency.get_item_correlation_dict(item_id, correlation_type=currencies_type, get_nb=get_nb)

def get_item_pgp(item_id, currencies_type=None, get_nb=False):
    '''
    Return all PGP correlations of a given item.

    :param item_id: item id
    :param currencies_type: list of pgp types
    :type currencies_type: list, optional
    '''
    return Pgp.pgp.get_item_correlation_dict(item_id, correlation_type=currencies_type, get_nb=get_nb)

def get_item_username(item_id, sub_type=None, get_nb=False):
    '''
    Return all usernames of a given item.

    :param item_id: item id
    :param sub_type: list of username type
    :type sub_type: list, optional
    '''
    return Username.correlation.get_item_correlation_dict(item_id, correlation_type=sub_type, get_nb=get_nb)

def get_item_decoded(item_id):
    '''
    Return all decoded items correlated with a given item.

    :param item_id: item id
    '''
    return Decoded.get_item_decoded(item_id)

def get_item_all_screenshot(item_id):
    '''
    Return all screenshots of a given item.

    :param item_id: item id
    '''
    return Screenshot.get_item_screenshot_list(item_id)

def get_item_all_correlation(item_id, correlation_names=None, get_nb=False):
    '''
    Return all correlations of a given item id.

    :param item_id: item id
    :type item_id: str

    :return: a dict of all correlations for an item id
    :rtype: dict
    '''
    if not correlation_names:
        correlation_names = Correlate_object.get_all_correlation_names()
    item_correl = {}
    for correlation_name in correlation_names:
        if correlation_name=='cryptocurrency':
            res = get_item_cryptocurrency(item_id, get_nb=get_nb)
        elif correlation_name=='pgp':
            res = get_item_pgp(item_id, get_nb=get_nb)
        elif correlation_name=='username':
            res = get_item_username(item_id, get_nb=get_nb)
        elif correlation_name=='decoded':
            res = get_item_decoded(item_id)
        elif correlation_name=='screenshot':
            res = get_item_all_screenshot(item_id)
        else:
            res = None
        # add correlation to dict
        if res:
            item_correl[correlation_name] = res
    return item_correl
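
# Shape sketch, inferred from how delete_item() walks this dict further down:
# 'cryptocurrency' and 'pgp' map subtypes to ids, the other correlation names
# map directly to ids (the values here are hypothetical):
#     get_item_all_correlation('archive/example_feeder/2021/10/04/example.gz')
#     # {'cryptocurrency': {'bitcoin': ['1A1zP1...']}, 'screenshot': ['ab...']}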

## TODO: REFACTOR
def _get_item_correlation(correlation_name, correlation_type, item_id):
    res = r_serv_metadata.smembers('item_{}_{}:{}'.format(correlation_name, correlation_type, item_id))
    if res:
        return list(res)
    else:
        return []

## TODO: REFACTOR
def get_item_bitcoin(item_id):
    return _get_item_correlation('cryptocurrency', 'bitcoin', item_id)

## TODO: REFACTOR
def get_item_pgp_key(item_id):
    return _get_item_correlation('pgpdump', 'key', item_id)

## TODO: REFACTOR
def get_item_pgp_name(item_id):
    return _get_item_correlation('pgpdump', 'name', item_id)

## TODO: REFACTOR
def get_item_pgp_mail(item_id):
    return _get_item_correlation('pgpdump', 'mail', item_id)

## TODO: REFACTOR
def get_item_pgp_correlation(item_id):
    pass

###
### GET Internal Module DESC
###
def get_item_list_desc(list_item_id):
    desc_list = []
    for item_id in list_item_id:
        desc_list.append( {'id': item_id, 'date': get_item_date(item_id), 'tags': Tag.get_obj_tag(item_id)} )
    return desc_list

def is_crawled(item_id):
    return item_basic.is_crawled(item_id)

# # TODO: rename, typo: get_crawler_metadata
def get_crawler_matadata(item_id, ltags=None):
    dict_crawler = {}
    if is_crawled(item_id):
        dict_crawler['domain'] = get_item_domain(item_id)
        if not ltags:
            ltags = Tag.get_obj_tag(item_id)
        dict_crawler['is_tags_safe'] = Tag.is_tags_safe(ltags)
        dict_crawler['url'] = get_item_link(item_id)
        dict_crawler['screenshot'] = get_item_screenshot(item_id)
        dict_crawler['har'] = get_item_har_name(item_id)
    return dict_crawler

def is_onion(item_id):
    is_onion = False
    if len(item_id) > 62:
        if is_crawled(item_id) and item_id[-42:-36] == '.onion':
            is_onion = True
    return is_onion

def is_item_in_domain(domain, item_id):
    is_in_domain = False
    domain_length = len(domain)
    # crawled item ids end with the domain followed by a 36-character suffix
    if len(item_id) > (domain_length+48):
        if item_id[-36-domain_length:-36] == domain:
            is_in_domain = True
    return is_in_domain

def get_item_domain(item_id):
    return item_basic.get_item_domain(item_id)

def get_domain(item_id):
    item_id = item_id.split('/')
    item_id = item_id[-1]
    return item_id[:-36]

def get_item_domain_with_port(item_id):
    return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'domain')

def get_item_link(item_id):
    return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'real_link')

def get_item_screenshot(item_id):
    screenshot = r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'screenshot')
    if screenshot:
        # rebuild the sharded screenshot path from the stored hash: ab/cd/ef/...
        return os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
    return ''

def get_item_har_name(item_id):
    har_path = os.path.join(har_directory, item_id) + '.json'
    if os.path.isfile(har_path):
        return har_path
    else:
        return None

def get_item_har(har_path):
    pass

def get_item_filename(item_id):
    # Creating the full filepath
    filename = os.path.join(PASTES_FOLDER, item_id)
    filename = os.path.realpath(filename)

    # incorrect filename: reject paths that resolve outside PASTES_FOLDER (traversal guard)
    if not os.path.commonprefix([filename, PASTES_FOLDER]) == PASTES_FOLDER:
        return None
    else:
        return filename

def get_item_duplicate(item_id, r_list=True):
    res = r_serv_metadata.smembers('dup:{}'.format(item_id))
    if r_list:
        if res:
            return list(res)
        else:
            return []
    return res

def get_item_nb_duplicates(item_id):
    return r_serv_metadata.scard('dup:{}'.format(item_id))

def get_item_duplicates_dict(item_id):
    dict_duplicates = {}
    for duplicate in get_item_duplicate(item_id):
        # each set member is a stringified "[algo, item_id, score]": strip the
        # brackets/quotes and split it back into its three fields
        duplicate = duplicate[1:-1].replace('\'', '').replace(' ', '').split(',')
        duplicate_id = duplicate[1]
        if not duplicate_id in dict_duplicates:
            dict_duplicates[duplicate_id] = {'date': get_item_date(duplicate_id, add_separator=True), 'algo': {}}
        algo = duplicate[0]
        if algo == 'tlsh':
            # tlsh scores are distances: convert to a similarity percentage
            similarity = 100 - int(duplicate[2])
        else:
            similarity = int(duplicate[2])
        dict_duplicates[duplicate_id]['algo'][algo] = similarity
    return dict_duplicates
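
# Output sketch (item ids, algorithms and scores are hypothetical):
#     get_item_duplicates_dict('archive/example_feeder/2021/10/04/example.gz')
#     # {'archive/other_feeder/2021/10/03/other.gz':
#     #      {'date': '2021/10/03', 'algo': {'tlsh': 83, 'ssdeep': 91}}}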

def add_item_duplicate(item_id, l_dup):
    for item_dup in l_dup:
        r_serv_metadata.sadd('dup:{}'.format(item_dup), item_id)
        r_serv_metadata.sadd('dup:{}'.format(item_id), item_dup)

def delete_item_duplicate(item_id):
    for item_dup in get_item_duplicate(item_id):
        r_serv_metadata.srem('dup:{}'.format(item_dup), item_id)
    r_serv_metadata.delete('dup:{}'.format(item_id))

def get_raw_content(item_id):
    filepath = get_item_filepath(item_id)
    with open(filepath, 'rb') as f:
        file_content = BytesIO(f.read())
    return file_content

def save_raw_content(item_id, io_content):
    filepath = get_item_filename(item_id)
    if os.path.isfile(filepath):
        #print('File already exists')
        return False
    # create subdir
    dirname = os.path.dirname(filepath)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    # # TODO: check if is IO file
    with open(filepath, 'wb') as f:
        f.write(io_content.getvalue())
    return True
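
# Usage sketch: save_raw_content() expects a BytesIO-like object, matching the
# io_content.getvalue() call above (the item id is hypothetical):
#     created = save_raw_content('submitted/2021/10/04/example.gz', BytesIO(b'raw content'))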

# IDEA: send item to duplicate ?
def create_item(obj_id, obj_metadata, io_content):
    '''
    Create a new Item (Import or Test only).

    :param obj_id: item id
    :param obj_metadata: item metadata
    :type obj_metadata: dict - 'first_seen', 'tags'

    :return: is item created
    :rtype: boolean
    '''
    # check if datetime match ??

    # # TODO: validate obj_id

    res = save_raw_content(obj_id, io_content)
    # item saved
    if res:
        # create tags
        if 'tags' in obj_metadata:
            # # TODO: handle mixed tags: taxonomies and Galaxies
            Tag.api_add_obj_tags(tags=obj_metadata['tags'], object_id=obj_id, object_type="item")
        return True

    # Item not created
    return False
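
# Creation sketch (all values hypothetical; tags must be valid taxonomy/galaxy tags):
#     obj_metadata = {'tags': ['infoleak:submission="manual"']}
#     create_item('submitted/2021/10/04/example.gz', obj_metadata, BytesIO(b'raw content'))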

def delete_item(obj_id):
    # check if item exists
    if not exist_item(obj_id):
        return False
    else:
        delete_item_duplicate(obj_id)
        # delete MISP event
        r_serv_metadata.delete('misp_events:{}'.format(obj_id))
        r_serv_metadata.delete('hive_cases:{}'.format(obj_id))

        os.remove(get_item_filename(obj_id))

        # get all correlations
        obj_correlations = get_item_all_correlation(obj_id)
        for correlation in obj_correlations:
            if correlation=='cryptocurrency' or correlation=='pgp':
                for obj2_subtype in obj_correlations[correlation]:
                    for obj2_id in obj_correlations[correlation][obj2_subtype]:
                        Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id,
                                                                 obj1_subtype=obj2_subtype)
            else:
                for obj2_id in obj_correlations[correlation]:
                    Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id)

        # delete father/child
        delete_node(obj_id)

        # delete item metadata
        r_serv_metadata.delete('paste_metadata:{}'.format(obj_id))

        return True

    ### TODO in import V2
    # delete from tracked items

    # # # TODO: # FIXME: LATER
    # delete from queue
    ###
    return False

#### ####
def delete_node(item_id):
    if is_node(item_id):
        if is_crawled(item_id):
            delete_domain_node(item_id)
        item_basic._delete_node(item_id)

def delete_domain_node(item_id):
    if is_domain_root(item_id):
        # remove from domain history
        domain, port = get_item_domain_with_port(item_id).split(':')
        domain_basic.delete_domain_item_core(item_id, domain, port)
    for child_id in get_all_domain_node_by_item_id(item_id):
        delete_item(child_id)


class Item(AbstractObject):
    """
    AIL Item Object. (strings)
    """

    def __init__(self, id):
        super(Item, self).__init__('item', id)

    def get_date(self, separator=False):
        """
        Returns Item date
        """
        return item_basic.get_item_date(self.id, add_separator=separator)

    def get_source(self):
        """
        Returns Item source/feeder name
        """
        #return self.id.split('/')[-5]
        l_source = self.id.split('/')[:-4]
        return os.path.join(*l_source)

    def get_basename(self):
        return os.path.basename(self.id)

    def get_filename(self):
        # Creating the full filepath
        filename = os.path.join(PASTES_FOLDER, self.id)
        filename = os.path.realpath(filename)

        # incorrect filename
        if not os.path.commonprefix([filename, PASTES_FOLDER]) == PASTES_FOLDER:
            return None
        else:
            return filename

    def get_content(self):
        """
        Returns Item content
        """
        return item_basic.get_item_content(self.id)

    def get_gzip_content(self, b64=False):
        # items are stored gzipped on disk: read the raw gzip bytes
        with open(self.get_filename(), 'rb') as f:
            content = f.read()
        # raw gzip bytes cannot be decoded as text: only return a str
        # for the base64-encoded form
        if b64:
            content = base64.b64encode(content).decode()
        return content

    def get_ail_2_ail_payload(self):
        payload = {'raw': self.get_gzip_content(b64=True),
                   'compress': 'gzip'}
        return payload

    # # TODO:
    def create(self):
        pass

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
    def delete(self):
        try:
            os.remove(self.get_filename())
            return True
        except FileNotFoundError:
            return False

    ############################################################################
    ############################################################################
    ############################################################################

    def exist_correlation(self):
        pass

    ############################################################################
    ############################################################################
    ############################################################################
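

# Usage sketch for the Item object above (hypothetical item id):
#     item = Item('archive/example_feeder/2021/10/04/example.gz')
#     print(item.get_date(separator=True), item.get_source())
#     payload = item.get_ail_2_ail_payload()  # {'raw': <b64 gzip>, 'compress': 'gzip'}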


#if __name__ == '__main__':
#
#     import Domain
#     domain = Domain.Domain('domain.onion')
#     for domain_history in domain.get_domain_history():
#         domain_item = domain.get_domain_items_crawled(epoch=domain_history[1]) # item_tag
#         if "items" in domain_item:
#             for item_dict in domain_item['items']:
#                 item_id = item_dict['id']
#                 print(item_id)
#                 for lang in get_item_languages(item_id, min_proportion=0.2, min_probability=0.8):
#                     print(lang)
#                 print()
#                 print(get_item_languages(item_id, min_proportion=0.2, min_probability=0.6)) # 0.7 ?