AIL-framework/bin/lib/item_basic.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import gzip

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader

# Module-level setup, executed once at import time.
config_loader = ConfigLoader.ConfigLoader()
# get and sanitize PASTE DIRECTORY
# Built from $AIL_HOME and the ("Directories", "pastes") configuration value.
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
# Resolve through realpath; joining with '' keeps a trailing path separator.
PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')

# Connections obtained from the config loader under the names
# "Redis_Cache" and "ARDB_Metadata".
r_cache = config_loader.get_redis_conn("Redis_Cache")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
# Drop the loader reference; it is not used again in this module.
config_loader = None

def exist_item(item_id):
    """Return True when the item's resolved path is an existing regular file.

    :param item_id: item identifier, relative to the pastes folder
    :return: bool
    """
    return os.path.isfile(get_item_filepath(item_id))

def get_item_filepath(item_id):
    """Return the real (symlink-resolved) path of an item.

    The item id is joined onto PASTES_FOLDER before being resolved.

    :param item_id: item identifier, relative to the pastes folder
    :return: resolved path as a string
    """
    return os.path.realpath(os.path.join(PASTES_FOLDER, item_id))

def get_item_date(item_id, add_separator=False):
    """Extract the date embedded in an item id.

    The three path components preceding the last one are concatenated,
    in that order (presumably year, month, day -- confirm against the
    item id layout).

    :param item_id: '/'-separated item identifier
    :param add_separator: when truthy, join the components with '/'
    :return: the date string, e.g. '20200102' or '2020/01/02'
    :raises IndexError: if the id has fewer than four components
    """
    parts = item_id.split('/')
    # Explicit indexing (not a slice) so that short ids still raise.
    first, second, third = parts[-4], parts[-3], parts[-2]
    separator = '/' if add_separator else ''
    return separator.join((first, second, third))

def get_source(item_id):
    """Return the path component located five positions from the end of the id.

    :param item_id: '/'-separated item identifier
    :return: the selected path component
    :raises IndexError: if the id has fewer than five components
    """
    components = item_id.split('/')
    return components[-5]

# # TODO: add an option to check the tag
def is_crawled(item_id):
    """Tell whether the item id begins with the 'crawled' prefix.

    :param item_id: item identifier
    :return: bool
    """
    crawled_prefix = 'crawled'
    return item_id.startswith(crawled_prefix)

def get_item_domain(item_id):
    """Return the domain part of an item id.

    The first 19 and the last 36 characters are stripped; what remains is
    returned. (Presumably 19 covers a 'crawled/YYYY/MM/DD/' prefix and 36 a
    UUID suffix -- confirm against the crawler's naming scheme.)

    :param item_id: item identifier
    :return: the remaining substring (empty if the id is too short)
    """
    head_length = 19
    tail_length = 36
    return item_id[head_length:-tail_length]

def get_item_content(item_id):
    """Return the decoded content of an item, using the cache when possible.

    The cache is queried first under the item's full path. On a miss (or a
    cache error) the gzip file is read from disk, decoded, and stored in the
    cache with a 300 second expiry.

    :param item_id: item identifier, relative to the pastes folder
    :return: the item content as a string, or '' if the file cannot be
             read or decoded
    """
    item_full_path = os.path.join(PASTES_FOLDER, item_id)
    try:
        item_content = r_cache.get(item_full_path)
    except Exception:
        # Any cache failure (including a decoding error) is handled as a miss.
        item_content = None
    if item_content is None:
        try:
            with gzip.open(item_full_path, 'r') as f:
                item_content = f.read().decode()
        except Exception:
            # Best effort: missing, corrupted or undecodable items yield ''.
            return ''
        try:
            r_cache.set(item_full_path, item_content)
            r_cache.expire(item_full_path, 300)
        except Exception:
            # Caching is optional; never lose content that was read
            # successfully just because the cache write failed.
            pass
    return str(item_content)

#### TREE CHILD/FATHER ####
def is_father(item_id):
    """Check whether a 'paste_children:<item_id>' key exists in the metadata store.

    :param item_id: item identifier
    :return: the result of the store's ``exists`` call
    """
    children_key = 'paste_children:{}'.format(item_id)
    return r_serv_metadata.exists(children_key)

def is_children(item_id):
    """Check whether the item's metadata hash has a 'father' field.

    :param item_id: item identifier
    :return: the result of the store's ``hexists`` call
    """
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hexists(metadata_key, 'father')

def is_root_node(item_id):
    """Tell whether an item is a tree root: it has children but no father.

    The previous signature took no argument while the body used ``item_id``,
    so every call raised NameError; the parameter is now declared.

    :param item_id: item identifier
    :return: bool
    """
    return bool(is_father(item_id) and not is_children(item_id))

def is_node(item_id):
    """Tell whether an item takes part in a tree, as a father or as a child.

    :param item_id: item identifier
    :return: bool
    """
    # `or` short-circuits: the child lookup only happens when needed.
    return bool(is_father(item_id) or is_children(item_id))

def is_leaf(item_id):
    """Tell whether an item is a leaf: it has a father but no children.

    :param item_id: item identifier
    :return: bool
    """
    if is_father(item_id):
        # Anything with children cannot be a leaf.
        return False
    return bool(is_children(item_id))

def is_domain_root(item_id):
    """Tell whether a crawled item is the root of its domain tree.

    A crawled item is a domain root when its father is not a crawled item,
    or when the father belongs to a different domain. Non-crawled items are
    never domain roots.

    :param item_id: item identifier
    :return: bool
    """
    if not is_crawled(item_id):
        return False
    item_father = get_item_parent(item_id)
    # A missing father (no 'father' field stored) is handled like a
    # non-crawled father instead of crashing in is_crawled(None).
    if not item_father or not is_crawled(item_father):
        return True
    # Crawled father: root only if the domain changes.
    return get_item_domain(item_father) != get_item_domain(item_id)

def get_nb_children(item_id):
    """Return the cardinality of the item's 'paste_children:<item_id>' set.

    :param item_id: item identifier
    :return: the result of the store's ``scard`` call
    """
    children_key = 'paste_children:{}'.format(item_id)
    return r_serv_metadata.scard(children_key)


def get_item_parent(item_id):
    """Return the 'father' field stored in the item's metadata hash.

    :param item_id: item identifier
    :return: the result of the store's ``hget`` call
    """
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hget(metadata_key, 'father')

def get_item_children(item_id):
    """Return the members of the item's 'paste_children:<item_id>' set as a list.

    :param item_id: item identifier
    :return: list of child item ids
    """
    children_key = 'paste_children:{}'.format(item_id)
    members = r_serv_metadata.smembers(children_key)
    return list(members)

# # TODO:  handle domain last origin in domain lib
def _delete_node(item_id):
    """Detach an item from the father/children tree.

    Removes the 'father' field of every child of the item, then deletes the
    item's own children set.

    :param item_id: item identifier
    """
    # only if item isn't deleted
    #if is_crawled(item_id):
    #    r_serv_metadata.hrem('paste_metadata:{}'.format(item_id), 'real_link')
    for child_id in get_item_children(item_id):
        child_metadata_key = 'paste_metadata:{}'.format(child_id)
        r_serv_metadata.hdel(child_metadata_key, 'father')
    own_children_key = 'paste_children:{}'.format(item_id)
    r_serv_metadata.delete(own_children_key)

    # delete regular
        # simple if leaf

    # delete item node

def get_all_domain_node_by_item_id(item_id, l_nodes=None):
    """Collect every descendant of an item that belongs to the same domain.

    The tree is walked depth-first; a child is collected (and descended
    into) only when its domain equals the domain of its direct parent.

    :param item_id: item identifier to start from
    :param l_nodes: optional list to append to; it is mutated and returned.
                    When omitted, a fresh list is created for each call
                    (the former ``[]`` default was shared between calls and
                    accumulated results).
    :return: list of descendant item ids
    """
    if l_nodes is None:
        l_nodes = []
    domain = get_item_domain(item_id)
    for child_id in get_item_children(item_id):
        if get_item_domain(child_id) == domain:
            l_nodes.append(child_id)
            l_nodes = get_all_domain_node_by_item_id(child_id, l_nodes)
    return l_nodes

##--  --##


def add_item_parent_by_parent_id(parent_type, parent_id, item_id):
    """Link an item to a parent identified by an external object id.

    The parent's item id is looked up from (parent_type, parent_id); nothing
    happens when the lookup yields no result.

    :param parent_type: type of the external id (see get_obj_id_item_id)
    :param parent_id: external id of the parent
    :param item_id: item to attach to the parent
    """
    parent_item_id = get_obj_id_item_id(parent_type, parent_id)
    if not parent_item_id:
        return
    add_item_parent(parent_item_id, item_id)

def add_item_parent(parent_item_id, item_id):
    """Record a father/child relation between two items.

    Sets the child's 'father' metadata field, then adds the child to the
    parent's children set.

    :param parent_item_id: item id of the father
    :param item_id: item id of the child
    :return: True
    """
    child_metadata_key = 'paste_metadata:{}'.format(item_id)
    parent_children_key = 'paste_children:{}'.format(parent_item_id)
    r_serv_metadata.hset(child_metadata_key, 'father', parent_item_id)
    r_serv_metadata.sadd(parent_children_key, item_id)
    return True

# TODO:
# FIXME:
#### UNKNOWN SECTION ####

def get_obj_id_item_id(parent_type, parent_id):
    """Map an external object id to an item id.

    Only the 'twitter_id' type is supported; it is looked up in the
    'map:twitter_id:item_id' hash.

    :param parent_type: type of the external id
    :param parent_id: external id to resolve
    :return: the stored item id, or None for an unsupported type
    """
    if parent_type not in ['twitter_id']:
        return None
    return r_serv_metadata.hget('map:twitter_id:item_id', parent_id)

def add_map_obj_id_item_id(obj_id, item_id, obj_type):
    """Store the mapping from an external object id to an item id.

    Only the 'twitter_id' type is handled (written to the
    'map:twitter_id:item_id' hash); any other type is ignored.

    :param obj_id: external object id
    :param item_id: item id to associate with it
    :param obj_type: type of the external id
    """
    if obj_type != 'twitter_id':
        return
    r_serv_metadata.hset('map:twitter_id:item_id', obj_id, item_id)

# delete twitter id

##--  --##

## COMMON ##
def _get_dir_source_name(directory, source_name=None, l_sources_name=None):
    """Recursively collect the source names found under a directory.

    A sub-path is reported as a source when it is an empty directory, or
    when it directly contains an entry whose name is 4 characters long and
    parses as an integer (presumably a year directory). Other directories
    are descended into; regular files are skipped.

    :param directory: root directory to scan
    :param source_name: path, relative to ``directory``, currently being
                        scanned (None for the root itself)
    :param l_sources_name: set to add results to; a new set is created when
                           omitted (the former ``set()`` default was shared
                           between calls)
    :return: set of source names, relative to ``directory``
    """
    if l_sources_name is None:
        l_sources_name = set()
    if source_name:
        l_dir = os.listdir(os.path.join(directory, source_name))
    else:
        l_dir = os.listdir(directory)
    # empty directory
    if not l_dir:
        # set.add() returns None, so add first and return the set itself.
        # The root itself has no name and is never reported.
        if source_name:
            l_sources_name.add(source_name)
        return l_sources_name
    for src_name in l_dir:
        # A year-like entry marks the current path as a source; this only
        # makes sense below the root, where a source name exists.
        if source_name and len(src_name) == 4:
            try:
                int(src_name)
            except ValueError:
                pass
            else:
                l_sources_name.add(source_name)
                return l_sources_name
        if source_name:
            src_name = os.path.join(source_name, src_name)
        # Only directories can hold sources; listing a file would raise.
        if not os.path.isdir(os.path.join(directory, src_name)):
            continue
        l_sources_name = _get_dir_source_name(directory, source_name=src_name, l_sources_name=l_sources_name)
    return l_sources_name


def get_all_items_sources():
    """Return the set of item sources found under PASTES_FOLDER.

    The result is also printed, as before, so that running this module as a
    script still displays it. Previously the function returned None.

    :return: the value produced by _get_dir_source_name(PASTES_FOLDER)
    """
    res = _get_dir_source_name(PASTES_FOLDER)
    print(res)
    return res

##--  --##


# Script entry point: when executed directly, list the item sources
# (get_all_items_sources prints its result).
if __name__ == '__main__':
    get_all_items_sources()
chg: [core import] add AIL JSON import format, API endpoint added (AIL feeders) 2020-04-28 14:14:45 +02:00			`#!/usr/bin/env python3`
			`# --coding:UTF-8 -`

			`import os`
			`import sys`
chg: [Item delete] delete father/child link + remove from domain tree + delete all child from the same domain 2020-07-07 11:23:23 +02:00			`import gzip`
chg: [core import] add AIL JSON import format, API endpoint added (AIL feeders) 2020-04-28 14:14:45 +02:00
			`sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))`
			`import ConfigLoader`

			`config_loader = ConfigLoader.ConfigLoader()`
			`# get and sanityze PASTE DIRECTORY`
			`PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'`
			`PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')`
fix: [urlexport importer] fix item name + redis config 2020-05-03 15:40:43 +02:00
chg: [Item delete] delete father/child link + remove from domain tree + delete all child from the same domain 2020-07-07 11:23:23 +02:00			`r_cache = config_loader.get_redis_conn("Redis_Cache")`
fix: [urlexport importer] fix item name + redis config 2020-05-03 15:40:43 +02:00			`r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")`
chg: [core import] add AIL JSON import format, API endpoint added (AIL feeders) 2020-04-28 14:14:45 +02:00			`config_loader = None`

			`def exist_item(item_id):`
			`filename = get_item_filepath(item_id)`
			`if os.path.isfile(filename):`
			`return True`
			`else:`
			`return False`

			`def get_item_filepath(item_id):`
			`filename = os.path.join(PASTES_FOLDER, item_id)`
			`return os.path.realpath(filename)`
chg: [import er url_extract] add item twitter parent 2020-04-30 15:46:38 +02:00
chg: [API + item library] add a way to check if a user can access the API + item import (use item_basic) 2020-05-27 20:43:00 +02:00			`def get_item_date(item_id, add_separator=False):`
			`l_directory = item_id.split('/')`
			`if add_separator:`
			`return '{}/{}/{}'.format(l_directory[-4], l_directory[-3], l_directory[-2])`
			`else:`
			`return '{}{}{}'.format(l_directory[-4], l_directory[-3], l_directory[-2])`

fix: [MISP auto export] fix MISP_TheHive_feeder 2020-06-19 13:36:03 +02:00			`def get_source(item_id):`
			`return item_id.split('/')[-5]`

chg: [API + item library] add a way to check if a user can access the API + item import (use item_basic) 2020-05-27 20:43:00 +02:00			`# # TODO: add an option to check the tag`
			`def is_crawled(item_id):`
			`return item_id.startswith('crawled')`

			`def get_item_domain(item_id):`
			`return item_id[19:-36]`

chg: [Item delete] delete father/child link + remove from domain tree + delete all child from the same domain 2020-07-07 11:23:23 +02:00			`def get_item_content(item_id):`
			`item_full_path = os.path.join(PASTES_FOLDER, item_id)`
			`try:`
			`item_content = r_cache.get(item_full_path)`
			`except UnicodeDecodeError:`
			`item_content = None`
			`except Exception as e:`
			`item_content = None`
			`if item_content is None:`
			`try:`
			`with gzip.open(item_full_path, 'r') as f:`
			`item_content = f.read().decode()`
			`r_cache.set(item_full_path, item_content)`
			`r_cache.expire(item_full_path, 300)`
			`except:`
			`item_content = ''`
			`return str(item_content)`

			`#### TREE CHILD/FATHER ####`
			`def is_father(item_id):`
			`return r_serv_metadata.exists('paste_children:{}'.format(item_id))`

			`def is_children(item_id):`
			`return r_serv_metadata.hexists('paste_metadata:{}'.format(item_id), 'father')`

			`def is_root_node():`
			`if is_father(item_id) and not is_children(item_id):`
			`return True`
			`else:`
			`return False`

			`def is_node(item_id):`
			`if is_father(item_id) or is_children(item_id):`
			`return True`
			`else:`
			`return False`

			`def is_leaf(item_id):`
			`if not is_father(item_id) and is_children(item_id):`
			`return True`
			`else:`
			`return False`

			`def is_domain_root(item_id):`
			`if not is_crawled(item_id):`
			`return False`
			`else:`
			`domain = get_item_domain(item_id)`
			`item_father = get_item_parent(item_id)`
			`if not is_crawled(item_father):`
			`return True`
			`else:`
			`# same domain`
			`if get_item_domain(item_father) == domain:`
			`return False`
			`else:`
			`return True`

			`def get_nb_children(item_id):`
			`return r_serv_metadata.scard('paste_children:{}'.format(item_id))`


			`def get_item_parent(item_id):`
			`return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'father')`

			`def get_item_children(item_id):`
			`return list(r_serv_metadata.smembers('paste_children:{}'.format(item_id)))`

			`# # TODO: handle domain last origin in domain lib`
			`def _delete_node(item_id):`
			`# only if item isn't deleted`
			`#if is_crawled(item_id):`
			`# r_serv_metadata.hrem('paste_metadata:{}'.format(item_id), 'real_link')`
			`for chidren_id in get_item_children(item_id):`
			`r_serv_metadata.hdel('paste_metadata:{}'.format(chidren_id), 'father')`
			`r_serv_metadata.delete('paste_children:{}'.format(item_id))`

			`# delete regular`
			`# simple if leaf`

			`# delete item node`

			`def get_all_domain_node_by_item_id(item_id, l_nodes=[]):`
			`domain = get_item_domain(item_id)`
			`for child_id in get_item_children(item_id):`
			`if get_item_domain(child_id) == domain:`
			`l_nodes.append(child_id)`
			`l_nodes = get_all_domain_node_by_item_id(child_id, l_nodes)`
			`return l_nodes`

			`##-- --##`


chg: [importer] add map twitter id - item id, add parents link between twitter and url extracted 2020-05-05 11:24:33 +02:00			`def add_item_parent_by_parent_id(parent_type, parent_id, item_id):`
			`parent_item_id = get_obj_id_item_id(parent_type, parent_id)`
			`if parent_item_id:`
fix: [import urlextract parent] fix typo 2020-05-05 11:39:06 +02:00			`add_item_parent(parent_item_id, item_id)`
chg: [importer] add map twitter id - item id, add parents link between twitter and url extracted 2020-05-05 11:24:33 +02:00
			`def add_item_parent(parent_item_id, item_id):`
fix: [urlextract importer] fix parent map + replace set JSON queue by list 2020-05-06 16:57:13 +02:00			`r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'father', parent_item_id)`
			`r_serv_metadata.sadd('paste_children:{}'.format(parent_item_id), item_id)`
			`return True`
chg: [importer] add map twitter id - item id, add parents link between twitter and url extracted 2020-05-05 11:24:33 +02:00
chg: [Item delete] delete father/child link + remove from domain tree + delete all child from the same domain 2020-07-07 11:23:23 +02:00			`# TODO:`
			`# FIXME:`
			`#### UNKNOW SECTION ####`
chg: [importer] add map twitter id - item id, add parents link between twitter and url extracted 2020-05-05 11:24:33 +02:00
			`def get_obj_id_item_id(parent_type, parent_id):`
			`all_parents_type = ['twitter_id']`
			`if parent_type in all_parents_type:`
			`return r_serv_metadata.hget('map:twitter_id:item_id', parent_id)`
			`else:`
			`return None`
chg: [Item delete] delete father/child link + remove from domain tree + delete all child from the same domain 2020-07-07 11:23:23 +02:00
			`def add_map_obj_id_item_id(obj_id, item_id, obj_type):`
			`if obj_type == 'twitter_id':`
			`r_serv_metadata.hset('map:twitter_id:item_id', obj_id, item_id)`

			`# delete twitter id`

			`##-- --##`
fix: [gitignore] 2021-02-10 15:27:31 +01:00
			`## COMMON ##`
			`def _get_dir_source_name(directory, source_name=None, l_sources_name=set()):`
			`if source_name:`
			`l_dir = os.listdir(os.path.join(directory, source_name))`
			`else:`
			`l_dir = os.listdir(directory)`
			`# empty directory`
			`if not l_dir:`
			`return l_sources_name.add(source_name)`
			`return l_sources_name`
			`else:`
			`for src_name in l_dir:`
			`if len(src_name) == 4:`
			`try:`
			`int(src_name)`
			`l_sources_name.add(os.path.join(source_name))`
			`return l_sources_name`
			`except:`
			`pass`
			`if source_name:`
			`src_name = os.path.join(source_name, src_name)`
			`l_sources_name = _get_dir_source_name(directory, source_name=src_name, l_sources_name=l_sources_name)`
			`return l_sources_name`


			`def get_all_items_sources():`
			`res = _get_dir_source_name(PASTES_FOLDER)`
			`print(res)`

			`##-- --##`


			`if __name__ == '__main__':`
			`get_all_items_sources()`