mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			
		
			
				
	
	
		
			955 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			955 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
| #!/usr/bin/env python3
 | |
| # -*-coding:UTF-8 -*
 | |
| 
 | |
| import base64
 | |
| import gzip
 | |
| import magic
 | |
| import os
 | |
| import re
 | |
| import sys
 | |
| import cld3
 | |
| import html2text
 | |
| 
 | |
| from io import BytesIO
 | |
| from uuid import uuid4
 | |
| 
 | |
| from pymisp import MISPObject
 | |
| 
 | |
| sys.path.append(os.environ['AIL_BIN'])
 | |
| ##################################
 | |
| # Import Project packages
 | |
| ##################################
 | |
| from lib.ail_core import get_ail_uuid, rreplace
 | |
| from lib.objects.abstract_object import AbstractObject
 | |
| from lib.ConfigLoader import ConfigLoader
 | |
| from lib import item_basic
 | |
| from lib.data_retention_engine import update_obj_date, get_obj_date_first
 | |
| from packages import Date
 | |
| 
 | |
| 
 | |
| from flask import url_for
 | |
| 
 | |
| config_loader = ConfigLoader()
 | |
| # # TODO:  get and sanitize ITEMS DIRECTORY
 | |
| ITEMS_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
 | |
| ITEMS_FOLDER = os.path.join(os.path.realpath(ITEMS_FOLDER), '')
 | |
| 
 | |
| r_cache = config_loader.get_redis_conn("Redis_Cache")
 | |
| r_object = config_loader.get_db_conn("Kvrocks_Objects")
 | |
| screenshot_directory = config_loader.get_files_directory('screenshot')
 | |
| har_directory = config_loader.get_files_directory('har')
 | |
| baseurl = config_loader.get_config_str("Notifications", "ail_domain")
 | |
| config_loader = None
 | |
| 
 | |
| 
 | |
| class Item(AbstractObject):
 | |
|     """
 | |
|     AIL Item Object. (strings)
 | |
|     """
 | |
| 
 | |
|     def __init__(self, id):
 | |
|         super(Item, self).__init__('item', id)
 | |
| 
 | |
|     def exists(self):
 | |
|         return item_basic.exist_item(self.id)
 | |
| 
 | |
|     def get_date(self, separator=False):
 | |
|         """
 | |
|         Returns Item date
 | |
|         """
 | |
|         return item_basic.get_item_date(self.id, add_separator=separator)
 | |
| 
 | |
|     def get_source(self):
 | |
|         """
 | |
|         Returns Item source/feeder name
 | |
|         """
 | |
|         # return self.id.split('/')[-5]
 | |
|         l_source = self.id.split('/')[:-4]
 | |
|         return os.path.join(*l_source)
 | |
| 
 | |
|     def get_basename(self):
 | |
|         return os.path.basename(self.id)
 | |
| 
 | |
|     def get_filename(self):
 | |
|         # Creating the full filepath
 | |
|         filename = os.path.join(ITEMS_FOLDER, self.id)
 | |
|         filename = os.path.realpath(filename)
 | |
| 
 | |
|         # incorrect filename
 | |
|         if not os.path.commonprefix([filename, ITEMS_FOLDER]) == ITEMS_FOLDER:
 | |
|             return None
 | |
|         else:
 | |
|             return filename
 | |
| 
 | |
|     def get_content(self, r_type='str'):
 | |
|         """
 | |
|         Returns Item content
 | |
|         """
 | |
|         if r_type == 'str':
 | |
|             return item_basic.get_item_content(self.id)
 | |
|         elif r_type == 'bytes':
 | |
|             return item_basic.get_item_content_binary(self.id)
 | |
| 
 | |
|     def get_raw_content(self, decompress=False):
 | |
|         filepath = self.get_filename()
 | |
|         if decompress:
 | |
|             raw_content = BytesIO(self.get_content(r_type='bytes'))
 | |
|         else:
 | |
|             with open(filepath, 'rb') as f:
 | |
|                 raw_content = BytesIO(f.read())
 | |
|         return raw_content
 | |
| 
 | |
|     def get_gzip_content(self, b64=False):
 | |
|         with open(self.get_filename(), 'rb') as f:
 | |
|             content = f.read()
 | |
|         if b64:
 | |
|             content = base64.b64encode(content)
 | |
|         return content.decode()
 | |
| 
 | |
|     def get_html2text_content(self, content=None, ignore_links=False):
 | |
|         if not content:
 | |
|             content = self.get_content()
 | |
|         h = html2text.HTML2Text()
 | |
|         h.ignore_links = ignore_links
 | |
|         h.ignore_images = ignore_links
 | |
|         return h.handle(content)
 | |
| 
 | |
|     def get_size(self, r_str=False):
 | |
|         size = os.path.getsize(self.get_filename())/1024.0
 | |
|         if r_str:
 | |
|             size = round(size, 2)
 | |
|         return size
 | |
| 
 | |
|     def get_ail_2_ail_payload(self):
 | |
|         payload = {'raw': self.get_gzip_content(b64=True)}
 | |
|         return payload
 | |
| 
 | |
|     def get_parent(self):
 | |
|         return item_basic.get_item_parent(self.id)
 | |
| 
 | |
|     def set_parent(self, parent_id):
 | |
|         r_object.sadd(f'child:item::{parent_id}', self.id)
 | |
|         r_object.hset(f'meta:item::{self.id}', 'parent', parent_id)
 | |
| 
 | |
|     def add_children(self, child_id):
 | |
|         r_object.sadd(f'child:item::{self.id}', child_id)
 | |
|         r_object.hset(f'meta:item::{child_id}', 'parent', self.id)
 | |
| 
 | |
| ####################################################################################
 | |
| ####################################################################################
 | |
| 
 | |
|     # TODO ADD function to check if ITEM (content + file) already exists
 | |
| 
 | |
|     def sanitize_id(self):
 | |
|         if ITEMS_FOLDER in self.id:
 | |
|             self.id = self.id.replace(ITEMS_FOLDER, '', 1)
 | |
| 
 | |
|         # limit filename length
 | |
|         basename = self.get_basename()
 | |
|         if len(basename) > 255:
 | |
|             new_basename = f'{basename[:215]}{str(uuid4())}.gz'
 | |
|             self.id = rreplace(self.id, basename, new_basename, 1)
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|         return self.id
 | |
| 
 | |
|     # # TODO: sanitize_id
 | |
|     # # TODO: check if already exists ?
 | |
|     # # TODO: check if duplicate
 | |
|     def save_on_disk(self, content, binary=True, compressed=False, b64=False):
 | |
|         if not binary:
 | |
|             content = content.encode()
 | |
|         if b64:
 | |
|             content = base64.standard_b64decode(content)
 | |
|         if not compressed:
 | |
|             content = gzip.compress(content)
 | |
| 
 | |
|         # # TODO: # FIXME: raise Exception id filename is None ######
 | |
|         filename = self.get_filename()
 | |
|         dirname = os.path.dirname(filename)
 | |
|         if not os.path.exists(dirname):
 | |
|             os.makedirs(dirname)
 | |
|         with open(filename, 'wb') as f:
 | |
|             f.write(content)
 | |
| 
 | |
|     # # TODO:
 | |
|     # correlations
 | |
|     # content
 | |
|     # tags
 | |
|     # origin
 | |
|     # duplicate -> all item iterations ???
 | |
|     #
 | |
|     def create(self, content, tags, father=None, duplicates=[], _save=True):
 | |
|         if _save:
 | |
|             self.save_on_disk(content, binary=True, compressed=False, base64=False)
 | |
| 
 | |
|         # # TODO:
 | |
|         # for tag in tags:
 | |
|         #     self.add_tag(tag)
 | |
| 
 | |
|         if father:
 | |
|             pass
 | |
| 
 | |
|         for obj_id in duplicates:
 | |
|             for dup in duplicates[obj_id]:
 | |
|                 self.add_duplicate(obj_id, dup['algo'], dup['similarity'])
 | |
| 
 | |
| 
 | |
|     # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
 | |
|     # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
 | |
|     def delete(self):
 | |
|         self._delete()
 | |
|         try:
 | |
|             os.remove(self.get_filename())
 | |
|             return True
 | |
|         except FileNotFoundError:
 | |
|             return False
 | |
| 
 | |
| ####################################################################################
 | |
| ####################################################################################
 | |
| 
 | |
|     def get_link(self, flask_context=False):
 | |
|         if flask_context:
 | |
|             url = url_for('objects_item.showItem', id=self.id)
 | |
|         else:
 | |
|             url = f'{baseurl}/object/item?id={self.id}'
 | |
|         return url
 | |
| 
 | |
|     def get_svg_icon(self):
 | |
|         if is_crawled(self.id):
 | |
|             color = 'red'
 | |
|         else:
 | |
|             color = '#332288'
 | |
|         return {'style': '', 'icon': '', 'color': color, 'radius': 5}
 | |
| 
 | |
|     def get_misp_object(self):
 | |
|         obj = MISPObject('ail-leak', standalone=True)
 | |
|         obj_date = self.get_date()
 | |
|         if obj_date:
 | |
|             obj.first_seen = obj_date
 | |
|         else:
 | |
|             self.logger.warning(
 | |
|                 f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={obj_date}')
 | |
| 
 | |
|         obj_attrs = [obj.add_attribute('first-seen', value=obj_date),
 | |
|                      obj.add_attribute('raw-data', value=self.id, data=self.get_raw_content()),
 | |
|                      obj.add_attribute('sensor', value=get_ail_uuid())]
 | |
|         for obj_attr in obj_attrs:
 | |
|             for tag in self.get_tags():
 | |
|                 obj_attr.add_tag(tag)
 | |
|         return obj
 | |
| 
 | |
|     def exist_correlation(self):
 | |
|         pass
 | |
| 
 | |
|     def is_crawled(self):
 | |
|         return self.id.startswith('crawled')
 | |
| 
 | |
|     # if is_crawled
 | |
|     def get_domain(self):
 | |
|         return self.id[19:-36]
 | |
| 
 | |
|     def get_screenshot(self):
 | |
|         s = self.get_correlation('screenshot')
 | |
|         if s.get('screenshot'):
 | |
|             s = s['screenshot'].pop()[1:]
 | |
|             return os.path.join(s[0:2], s[2:4], s[4:6], s[6:8], s[8:10], s[10:12], s[12:])
 | |
| 
 | |
|     def get_har(self):
 | |
|         har_path = os.path.join(har_directory, self.id) + '.json'
 | |
|         if os.path.isfile(har_path):
 | |
|             return har_path
 | |
|         else:
 | |
|             return None
 | |
| 
 | |
|     def get_url(self):
 | |
|         return r_object.hget(f'meta:item::{self.id}', 'url')
 | |
| 
 | |
|     def set_crawled(self, url, parent_id):
 | |
|         r_object.hset(f'meta:item::{self.id}', 'url', url)
 | |
|         self.set_parent(parent_id)
 | |
| 
 | |
|     # options: set of optional meta fields
 | |
|     def get_meta(self, options=None):
 | |
|         """
 | |
|         :type options: set
 | |
|         """
 | |
|         if options is None:
 | |
|             options = set()
 | |
|         meta = self.get_default_meta(tags=True)
 | |
|         meta['date'] = self.get_date(separator=True)
 | |
|         meta['source'] = self.get_source()
 | |
|         # optional meta fields
 | |
|         if 'content' in options:
 | |
|             meta['content'] = self.get_content()
 | |
|         if 'crawler' in options:
 | |
|             if self.is_crawled():
 | |
|                 tags = meta.get('tags')
 | |
|                 meta['crawler'] = self.get_meta_crawler(tags=tags)
 | |
|         if 'duplicates' in options:
 | |
|             meta['duplicates'] = self.get_duplicates()
 | |
|         if 'lines' in options:
 | |
|             content = meta.get('content')
 | |
|             meta['lines'] = self.get_meta_lines(content=content)
 | |
|         if 'parent' in options:
 | |
|             meta['parent'] = self.get_parent()
 | |
|         if 'size' in options:
 | |
|             meta['size'] = self.get_size(r_str=True)
 | |
|         if 'mimetype' in options:
 | |
|             content = meta.get('content')
 | |
|             meta['mimetype'] = self.get_mimetype(content=content)
 | |
|         if 'investigations' in options:
 | |
|             meta['investigations'] = self.get_investigations()
 | |
|         if 'link' in options:
 | |
|             meta['link'] = self.get_link(flask_context=True)
 | |
| 
 | |
|         # meta['encoding'] = None
 | |
|         return meta
 | |
| 
 | |
|     def get_meta_crawler(self, tags=None):
 | |
|         """
 | |
|         :type tags: list
 | |
|         """
 | |
|         if tags is None:
 | |
|             tags = []
 | |
|         crawler = {}
 | |
|         if self.is_crawled():
 | |
|             crawler['domain'] = self.get_domain()
 | |
|             crawler['har'] = self.get_har()
 | |
|             crawler['screenshot'] = self.get_screenshot()
 | |
|             crawler['url'] = self.get_url()
 | |
|             if not tags:
 | |
|                 tags = self.get_tags()
 | |
|             crawler['is_tags_safe'] = self.is_tags_safe(tags)
 | |
|         return crawler
 | |
| 
 | |
|     def get_meta_lines(self, content=None):
 | |
|         if not content:
 | |
|             content = self.get_content()
 | |
|         max_length = 0
 | |
|         nb_line = 0
 | |
|         for line in content.splitlines():
 | |
|             length = len(line)
 | |
|             if length > max_length:
 | |
|                 max_length = length
 | |
|             nb_line += 1
 | |
|         return {'nb': nb_line, 'max_length': max_length}
 | |
| 
 | |
|     def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
 | |
|         all_languages = []
 | |
|         ## CLEAN CONTENT ##
 | |
|         content = self.get_html2text_content(ignore_links=True)
 | |
|         content = remove_all_urls_from_content(self.id, item_content=content) ##########################################
 | |
|         # REMOVE USELESS SPACE
 | |
|         content = ' '.join(content.split())
 | |
|         #- CLEAN CONTENT -#
 | |
|         #print(content)
 | |
|         #print(len(content))
 | |
|         if len(content) >= min_len: # # TODO:  # FIXME: check num langs limit
 | |
|             for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
 | |
|                 if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
 | |
|                     all_languages.append(lang)
 | |
|         return all_languages
 | |
| 
 | |
|     def get_mimetype(self, content=None):
 | |
|         if not content:
 | |
|             content = self.get_content()
 | |
|         return magic.from_buffer(content, mime=True)
 | |
| 
 | |
|     ############################################################################
 | |
|     ############################################################################
 | |
| 
 | |
| def _get_dir_source_name(directory, source_name=None, l_sources_name=None, filter_dir=False):
 | |
|     """
 | |
|     :type l_sources_name: set
 | |
|     """
 | |
|     if not l_sources_name:
 | |
|         l_sources_name = set()
 | |
|     if source_name:
 | |
|         l_dir = os.listdir(os.path.join(directory, source_name))
 | |
|     else:
 | |
|         l_dir = os.listdir(directory)
 | |
|     # empty directory
 | |
|     if not l_dir:
 | |
|         return l_sources_name.add(source_name)
 | |
|     else:
 | |
|         for src_name in l_dir:
 | |
|             if len(src_name) == 4:
 | |
|                 # try:
 | |
|                 int(src_name)
 | |
|                 to_add = os.path.join(source_name)
 | |
|                 # filter sources, remove first directory
 | |
|                 if filter_dir:
 | |
|                     to_add = to_add.replace('archive/', '').replace('alerts/', '')
 | |
|                 l_sources_name.add(to_add)
 | |
|                 return l_sources_name
 | |
|                 # except:
 | |
|                 #    pass
 | |
|             if source_name:
 | |
|                 src_name = os.path.join(source_name, src_name)
 | |
|             l_sources_name = _get_dir_source_name(directory, source_name=src_name, l_sources_name=l_sources_name, filter_dir=filter_dir)
 | |
|     return l_sources_name
 | |
| 
 | |
| def get_items_sources(filter_dir=False, r_list=False):
 | |
|     res = _get_dir_source_name(ITEMS_FOLDER, filter_dir=filter_dir)
 | |
|     if res:
 | |
|         if r_list:
 | |
|             res = list(res)
 | |
|         return res
 | |
|     else:
 | |
|         return []
 | |
| 
 | |
| def get_items_by_source(source):
 | |
|     l_items = []
 | |
|     dir_item = os.path.join(os.environ['AIL_HOME'], ITEMS_FOLDER, source)
 | |
|     for root, dirs, files in os.walk(dir_item):
 | |
|         for file in files:
 | |
|             item_id = os.path.join(root, file).replace(ITEMS_FOLDER, '', 1)
 | |
|             l_items.append(item_id)
 | |
|     return l_items
 | |
| 
 | |
| def _manual_set_items_date_first_last():
 | |
|     first = 9999
 | |
|     last = 0
 | |
|     sources = get_items_sources()
 | |
|     for source in sources:
 | |
|         dir_source = os.path.join(os.environ['AIL_HOME'], ITEMS_FOLDER, source)
 | |
|         for dir_name in os.listdir(dir_source):
 | |
|             if os.path.isdir(os.path.join(dir_source, dir_name)):
 | |
|                 date = int(dir_name)
 | |
|                 if date < first:
 | |
|                     first = date
 | |
|                 if date > last:
 | |
|                     last = date
 | |
|     if first != 9999:
 | |
|         update_obj_date(first, 'item')
 | |
|     if last != 0:
 | |
|         update_obj_date(last, 'item')
 | |
| 
 | |
| ################################################################################
 | |
| ################################################################################
 | |
| ################################################################################
 | |
| 
 | |
| def get_nb_items_objects(filters={}):
 | |
|     nb = 0
 | |
|     date_from = filters.get('date_from')
 | |
|     date_to = filters.get('date_to')
 | |
|     if 'sources' in filters:
 | |
|         sources = filters['sources']
 | |
|     else:
 | |
|         sources = get_all_sources()
 | |
|     sources = sorted(sources)
 | |
| 
 | |
|     # date
 | |
|     if date_from and date_to:
 | |
|         daterange = Date.get_daterange(date_from, date_to)
 | |
|     elif date_from:
 | |
|         daterange = Date.get_daterange(date_from, Date.get_today_date_str())
 | |
|     elif date_to:
 | |
|         date_from = get_obj_date_first('item')
 | |
|         daterange = Date.get_daterange(date_from, date_to)
 | |
|     else:
 | |
|         date_from = get_obj_date_first('item')
 | |
|         daterange = Date.get_daterange(date_from, Date.get_today_date_str())
 | |
| 
 | |
|     for source in sources:
 | |
|         for date in daterange:
 | |
|             date = f'{date[0:4]}/{date[4:6]}/{date[6:8]}'
 | |
|             full_dir = os.path.join(ITEMS_FOLDER, source, date)
 | |
|             if not os.path.isdir(full_dir):
 | |
|                 continue
 | |
|             nb += len(os.listdir(full_dir))
 | |
|     return nb
 | |
| 
 | |
| def get_all_items_objects(filters={}):
 | |
|     date_from = filters.get('date_from')
 | |
|     date_to = filters.get('date_to')
 | |
|     if 'sources' in filters:
 | |
|         sources = filters['sources']
 | |
|     else:
 | |
|         sources = get_all_sources()
 | |
|     sources = sorted(sources)
 | |
|     if filters.get('start'):
 | |
|         _, start_id = filters['start'].split(':', 1)
 | |
|         item = Item(start_id)
 | |
|         # remove sources
 | |
|         start_source = item.get_source()
 | |
|         i = 0
 | |
|         while start_source and len(sources) > i:
 | |
|             if sources[i] == start_source:
 | |
|                 sources = sources[i:]
 | |
|                 start_source = None
 | |
|             i += 1
 | |
|         start_date = item.get_date()
 | |
|     else:
 | |
|         start_id = None
 | |
|         start_date = None
 | |
| 
 | |
|     # date
 | |
|     if date_from and date_to:
 | |
|         daterange = Date.get_daterange(date_from, date_to)
 | |
|     elif date_from:
 | |
|         daterange = Date.get_daterange(date_from, Date.get_today_date_str())
 | |
|     elif date_to:
 | |
|         date_from = get_obj_date_first('item')
 | |
|         daterange = Date.get_daterange(date_from, date_to)
 | |
|     else:
 | |
|         date_from = get_obj_date_first('item')
 | |
|         daterange = Date.get_daterange(date_from, Date.get_today_date_str())
 | |
|     if start_date:
 | |
|         if int(start_date) > int(date_from):
 | |
|             i = 0
 | |
|             while start_date and len(daterange) > i:
 | |
|                 if daterange[i] == start_date:
 | |
|                     daterange = daterange[i:]
 | |
|                     start_date = None
 | |
|                 i += 1
 | |
| 
 | |
|     for source in sources:
 | |
|         for date in daterange:
 | |
|             date = f'{date[0:4]}/{date[4:6]}/{date[6:8]}'
 | |
|             full_dir = os.path.join(ITEMS_FOLDER, source, date)
 | |
|             s_dir = os.path.join(source, date)
 | |
|             if not os.path.isdir(full_dir):
 | |
|                 continue
 | |
| 
 | |
|             # TODO replace by os.scandir() ????
 | |
|             all_items = sorted([os.path.join(s_dir, f)
 | |
|                                 for f in os.listdir(full_dir)
 | |
|                                 if os.path.isfile(os.path.join(full_dir, f))])
 | |
|             # start obj id
 | |
|             if start_id:
 | |
|                 i = 0
 | |
|                 while start_id and len(all_items) > i:
 | |
|                     if all_items[i] == start_id:
 | |
|                         if i == len(all_items):
 | |
|                             all_items = []
 | |
|                         else:
 | |
|                             all_items = all_items[i+1:]
 | |
|                         start_id = None
 | |
|                     i += 1
 | |
|             for obj_id in all_items:
 | |
|                 yield Item(obj_id)
 | |
| 
 | |
| ################################################################################
 | |
| ################################################################################
 | |
| ################################################################################
 | |
| 
 | |
| #### API ####
 | |
| 
 | |
| def api_get_item(data):
 | |
|     item_id = data.get('id', None)
 | |
|     if not item_id:
 | |
|         return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
 | |
|     item = Item(item_id)
 | |
|     if not item.exists():
 | |
|         return {'status': 'error', 'reason': 'Item not found'}, 404
 | |
| 
 | |
|     options = set()
 | |
|     if data.get('content'):
 | |
|         options.add('content')
 | |
|     if data.get('crawler'):
 | |
|         options.add('crawler')
 | |
|     if data.get('duplicates'):
 | |
|         options.add('duplicates')
 | |
|     if data.get('lines'):
 | |
|         options.add('lines')
 | |
|     if data.get('mimetype'):
 | |
|         options.add('mimetype')
 | |
|     if data.get('parent'):
 | |
|         options.add('parent')
 | |
|     if data.get('size'):
 | |
|         options.add('size')
 | |
| 
 | |
|     # TODO correlation
 | |
| 
 | |
|     return item.get_meta(options=options), 200
 | |
| 
 | |
| 
 | |
| # -- API -- #
 | |
| 
 | |
| ################################################################################
 | |
| ################################################################################
 | |
| ################################################################################
 | |
| 
 | |
|             # TODO
 | |
| 
 | |
| def exist_item(item_id):
 | |
|     return item_basic.exist_item(item_id)
 | |
| 
 | |
| def get_basename(item_id):
 | |
|     return os.path.basename(item_id)
 | |
| 
 | |
| def get_item_id(full_path):
 | |
|     return full_path.replace(ITEMS_FOLDER, '', 1)
 | |
| 
 | |
| def get_item_filepath(item_id):
 | |
|     return item_basic.get_item_filepath(item_id)
 | |
| 
 | |
| def get_item_date(item_id, add_separator=False):
 | |
|     return item_basic.get_item_date(item_id, add_separator=add_separator)
 | |
| 
 | |
| def get_source(item_id):
 | |
|     return item_basic.get_source(item_id)
 | |
| 
 | |
| def get_all_sources():
 | |
|     return item_basic.get_all_items_sources(r_list=True)
 | |
| 
 | |
| def get_item_basename(item_id):
 | |
|     return os.path.basename(item_id)
 | |
| 
 | |
| def get_item_size(item_id):
 | |
|     return round(os.path.getsize(os.path.join(ITEMS_FOLDER, item_id))/1024.0, 2)
 | |
| 
 | |
| def get_item_encoding(item_id):
 | |
|     return None
 | |
| 
 | |
| def get_lines_info(item_id, item_content=None):
 | |
|     if not item_content:
 | |
|         item_content = get_item_content(item_id)
 | |
|     max_length = 0
 | |
|     line_id = 0
 | |
|     nb_line = 0
 | |
|     for line in item_content.splitlines():
 | |
|         length = len(line)
 | |
|         if length > max_length:
 | |
|             max_length = length
 | |
|         nb_line += 1
 | |
|     return {'nb': nb_line, 'max_length': max_length}
 | |
| 
 | |
| 
 | |
| def get_item_metadata(item_id, item_content=None):
 | |
|     ## TODO: FIXME ##performance
 | |
|     # encoding
 | |
|     # language
 | |
|     # lines info
 | |
|     item_metadata = {'date': get_item_date(item_id, add_separator=True),
 | |
|                      'source': get_source(item_id),
 | |
|                      'size': get_item_size(item_id),
 | |
|                      'encoding': get_item_encoding(item_id),
 | |
|                      'lines': get_lines_info(item_id, item_content=item_content)
 | |
|                      }
 | |
|     return item_metadata
 | |
| 
 | |
| def get_item_content(item_id):
 | |
|     return item_basic.get_item_content(item_id)
 | |
| 
 | |
| def get_item_content_html2text(item_id, item_content=None, ignore_links=False):
 | |
|     if not item_content:
 | |
|         item_content = get_item_content(item_id)
 | |
|     h = html2text.HTML2Text()
 | |
|     h.ignore_links = ignore_links
 | |
|     h.ignore_images = ignore_links
 | |
|     return h.handle(item_content)
 | |
| 
 | |
| def remove_all_urls_from_content(item_id, item_content=None):
 | |
|     if not item_content:
 | |
|         item_content = get_item_content(item_id)
 | |
|     regex = r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b'
 | |
|     url_regex = re.compile(regex)
 | |
|     urls = url_regex.findall(item_content)
 | |
|     urls = sorted(urls, key=len, reverse=True)
 | |
|     for url in urls:
 | |
|         item_content = item_content.replace(url, '')
 | |
| 
 | |
|     regex_pgp_public_blocs = r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----'
 | |
|     regex_pgp_signature = r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----'
 | |
|     regex_pgp_message = r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----'
 | |
|     re.compile(regex_pgp_public_blocs)
 | |
|     re.compile(regex_pgp_signature)
 | |
|     re.compile(regex_pgp_message)
 | |
| 
 | |
|     res = re.findall(regex_pgp_public_blocs, item_content)
 | |
|     for it in res:
 | |
|         item_content = item_content.replace(it, '')
 | |
|     res = re.findall(regex_pgp_signature, item_content)
 | |
|     for it in res:
 | |
|         item_content = item_content.replace(it, '')
 | |
|     res = re.findall(regex_pgp_message, item_content)
 | |
|     for it in res:
 | |
|         item_content = item_content.replace(it, '')
 | |
| 
 | |
|     return item_content
 | |
| 
 | |
| def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
 | |
|     all_languages = []
 | |
| 
 | |
|     ## CLEAN CONTENT ##
 | |
|     content = get_item_content_html2text(item_id, ignore_links=True)
 | |
|     content = remove_all_urls_from_content(item_id, item_content=content)
 | |
| 
 | |
|     # REMOVE USELESS SPACE
 | |
|     content = ' '.join(content.split())
 | |
|     #- CLEAN CONTENT -#
 | |
| 
 | |
|     #print(content)
 | |
|     #print(len(content))
 | |
|     if len(content) >= min_len:
 | |
|         for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
 | |
|             if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
 | |
|                 all_languages.append(lang)
 | |
|     return all_languages
 | |
| 
 | |
| # API
 | |
| # def get_item(request_dict):
 | |
| #     if not request_dict:
 | |
| #         return {'status': 'error', 'reason': 'Malformed JSON'}, 400
 | |
| #
 | |
| #     item_id = request_dict.get('id', None)
 | |
| #     if not item_id:
 | |
| #         return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
 | |
| #     if not exist_item(item_id):
 | |
| #         return {'status': 'error', 'reason': 'Item not found'}, 404
 | |
| #
 | |
| #     dict_item = {}
 | |
| #     dict_item['id'] = item_id
 | |
| #     date = request_dict.get('date', True)
 | |
| #     if date:
 | |
| #         add_separator = False
 | |
| #         if request_dict.get('date_separator', False):
 | |
| #             add_separator = True
 | |
| #         dict_item['date'] = get_item_date(item_id, add_separator=add_separator)
 | |
| #     tags = request_dict.get('tags', True)
 | |
| #     if tags:
 | |
| #         dict_item['tags'] = Tag.get_object_tags('item', item_id)
 | |
| #
 | |
| #     size = request_dict.get('size', False)
 | |
| #     if size:
 | |
| #         dict_item['size'] = get_item_size(item_id)
 | |
| #
 | |
| #     content = request_dict.get('content', False)
 | |
| #     if content:
 | |
| #         # UTF-8 outpout, # TODO: use base64
 | |
| #         dict_item['content'] = get_item_content(item_id)
 | |
| #
 | |
| #     raw_content = request_dict.get('raw_content', False)
 | |
| #     if raw_content:
 | |
| #         dict_item['raw_content'] = get_raw_content(item_id)
 | |
| #
 | |
| #     lines_info = request_dict.get('lines', False)
 | |
| #     if lines_info:
 | |
| #         dict_item['lines'] = get_lines_info(item_id, dict_item.get('content', 'None'))
 | |
| #
 | |
| #     if request_dict.get('pgp'):
 | |
| #         dict_item['pgp'] = {}
 | |
| #         if request_dict['pgp'].get('key'):
 | |
| #             dict_item['pgp']['key'] = get_item_pgp_key(item_id)
 | |
| #         if request_dict['pgp'].get('mail'):
 | |
| #             dict_item['pgp']['mail'] = get_item_pgp_mail(item_id)
 | |
| #         if request_dict['pgp'].get('name'):
 | |
| #             dict_item['pgp']['name'] = get_item_pgp_name(item_id)
 | |
| #
 | |
| #     if request_dict.get('cryptocurrency'):
 | |
| #         dict_item['cryptocurrency'] = {}
 | |
| #         if request_dict['cryptocurrency'].get('bitcoin'):
 | |
| #             dict_item['cryptocurrency']['bitcoin'] = get_item_bitcoin(item_id)
 | |
| #
 | |
| #     return dict_item, 200
 | |
| 
 | |
| 
 | |
| 
 | |
| def api_get_item_content_base64_utf8(request_dict):
 | |
|     item_id = request_dict.get('id', None)
 | |
|     if not request_dict:
 | |
|         return {'status': 'error', 'reason': 'Malformed JSON'}, 400
 | |
|     if not item_id:
 | |
|         return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
 | |
|     if not exist_item(item_id):
 | |
|         return {'status': 'error', 'reason': 'Item not found'}, 404
 | |
| 
 | |
|     item_content = get_item_content(item_id)
 | |
|     item_content = base64.b64encode((item_content.encode('utf-8'))).decode('UTF-8')
 | |
|     return {'status': 'success', 'content': item_content}, 200
 | |
| 
 | |
| 
 | |
| def api_get_items_sources():
 | |
|     item_content = {'sources': get_all_sources()}
 | |
|     return item_content, 200
 | |
| 
 | |
| # def check_item_source(request_dict):
 | |
| #     source = request_dict.get('source', None)
 | |
| #     if not request_dict:
 | |
| #         return {'status': 'error', 'reason': 'Malformed JSON'}, 400
 | |
| #     if not source:
 | |
| #         return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
 | |
| #
 | |
| #     all_sources = item_basic.get_all_items_sources()
 | |
| #
 | |
| #     if source not in all_sources:
 | |
| #         return {'status': 'error', 'reason': 'Invalid source', 'provide': source}, 400
 | |
| #     return {'status': 'success', 'reason': 'Valid source', 'provide': source}, 200
 | |
| 
 | |
| 
 | |
| ###
 | |
| ### GET Internal Module DESC
 | |
| ###
 | |
| def get_item_list_desc(list_item_id):
 | |
|     desc_list = []
 | |
|     for item_id in list_item_id:
 | |
|         item = Item(item_id)
 | |
|         desc_list.append( {'id': item_id, 'date': get_item_date(item_id), 'tags': item.get_tags(r_list=True)})
 | |
|     return desc_list
 | |
| 
 | |
| def is_crawled(item_id):
 | |
|     return item_basic.is_crawled(item_id)
 | |
| 
 | |
| def is_onion(item_id):
 | |
|     is_onion = False
 | |
|     if len(is_onion) > 62:
 | |
|         if is_crawled(item_id) and item_id[-42:-36] == '.onion':
 | |
|             is_onion = True
 | |
|     return is_onion
 | |
| 
 | |
| def is_item_in_domain(domain, item_id):
 | |
|     is_in_domain = False
 | |
|     domain_lenght = len(domain)
 | |
|     if len(item_id) > (domain_lenght+48):
 | |
|         if item_id[-36-domain_lenght:-36] == domain:
 | |
|             is_in_domain = True
 | |
|     return is_in_domain
 | |
| 
 | |
| def get_item_domain(item_id):
 | |
|     return item_basic.get_item_domain(item_id)
 | |
| 
 | |
| def get_domain(item_id):
 | |
|     item_id = item_id.split('/')
 | |
|     item_id = item_id[-1]
 | |
|     return item_id[:-36]
 | |
| 
 | |
| # TODO MOVE ME
 | |
| def get_item_har_name(item_id):
 | |
|     har_path = os.path.join(har_directory, item_id) + '.json'
 | |
|     if os.path.isfile(har_path):
 | |
|         return har_path
 | |
|     else:
 | |
|         return None
 | |
| 
 | |
| def get_item_filename(item_id):
 | |
|     # Creating the full filepath
 | |
|     filename = os.path.join(ITEMS_FOLDER, item_id)
 | |
|     filename = os.path.realpath(filename)
 | |
| 
 | |
|     # incorrect filename
 | |
|     if not os.path.commonprefix([filename, ITEMS_FOLDER]) == ITEMS_FOLDER:
 | |
|         return None
 | |
|     else:
 | |
|         return filename
 | |
| 
 | |
| def get_raw_content(item_id):
 | |
|     filepath = get_item_filepath(item_id)
 | |
|     with open(filepath, 'rb') as f:
 | |
|         file_content = BytesIO(f.read())
 | |
|     return file_content
 | |
| 
 | |
| def save_raw_content(item_id, io_content):
 | |
|     filepath = get_item_filename(item_id)
 | |
|     if os.path.isfile(filepath):
 | |
|         #print('File already exist')
 | |
|         return False
 | |
|     # create subdir
 | |
|     dirname = os.path.dirname(filepath)
 | |
|     if not os.path.exists(dirname):
 | |
|         os.makedirs(dirname)
 | |
|     # # TODO: check if is IO file
 | |
|     with open(filepath, 'wb') as f:
 | |
|         f.write(io_content.getvalue())
 | |
|     return True
 | |
| 
 | |
| # IDEA: send item to duplicate ?
 | |
| def create_item(obj_id, obj_metadata, io_content):
 | |
|     '''
 | |
|     Create a new Item (Import or Test only).
 | |
| 
 | |
|     :param obj_id: item id
 | |
|     :type obj_metadata: dict - 'first_seen', 'tags'
 | |
| 
 | |
|     :return: is item created
 | |
|     :rtype: boolean
 | |
|     '''
 | |
|     # check if datetime match ??
 | |
| 
 | |
| 
 | |
|     # # TODO: validate obj_id
 | |
| 
 | |
|     res = save_raw_content(obj_id, io_content)
 | |
|     # item saved
 | |
|     if res:
 | |
|         # creata tags
 | |
|         if 'tags' in obj_metadata:
 | |
|             item = Item(obj_id)
 | |
|             # # TODO: handle mixed tags: taxonomies and Galaxies
 | |
|             # for tag in obj_metadata['tags']:
 | |
|             #     item.add_tag(tag)
 | |
|         return True
 | |
| 
 | |
|     # Item not created
 | |
|     return False
 | |
| 
 | |
|     # # check if item exists
 | |
|     # if not exist_item(obj_id):
 | |
|     #     return False
 | |
|     # else:
 | |
|     #     delete_item_duplicate(obj_id)
 | |
|     #     # delete MISP event
 | |
|     #     r_s_metadata.delete('misp_events:{}'.format(obj_id))
 | |
|     #     r_s_metadata.delete('hive_cases:{}'.format(obj_id))
 | |
|     #
 | |
|     #     os.remove(get_item_filename(obj_id))
 | |
|     #
 | |
|     #     # get all correlation
 | |
|     #     obj_correlations = get_item_all_correlation(obj_id)
 | |
|     #     for correlation in obj_correlations:
 | |
|     #         if correlation=='cryptocurrency' or correlation=='pgp':
 | |
|     #             for obj2_subtype in obj_correlations[correlation]:
 | |
|     #                 for obj2_id in obj_correlations[correlation][obj2_subtype]:
 | |
|     #                     Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id,
 | |
|     #                                                         obj1_subtype=obj2_subtype)
 | |
|     #         else:
 | |
|     #             for obj2_id in obj_correlations[correlation]:
 | |
|     #                 Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id)
 | |
|     #
 | |
|     #     # delete father/child
 | |
|     #     delete_node(obj_id)
 | |
|     #
 | |
|     #     # delete item metadata
 | |
|     #
 | |
|     #     return True
 | |
|     #
 | |
|     # ### TODO in inport V2
 | |
|     # # delete from tracked items
 | |
|     #
 | |
|     # # # # TODO: # FIXME: LATER
 | |
|     # # delete from queue
 | |
|     # ###
 | |
|     # return False
 | |
| 
 | |
| #### ####
 | |
| # def delete_node(item_id):
 | |
| #     if is_node(item_id):
 | |
| #         if is_crawled(item_id):
 | |
| #             delete_domain_node(item_id)
 | |
| #         item_basic._delete_node(item_id)
 | |
| #
 | |
| # def delete_domain_node(item_id):
 | |
| #     if is_domain_root(item_id):
 | |
| #         # remove from domain history
 | |
| #         domain, port = get_item_domain_with_port(item_id).split(':')
 | |
| #         domain_basic.delete_domain_item_core(item_id, domain, port)
 | |
| #     for child_id in get_all_domain_node_by_item_id(item_id):
 | |
| #         delete_item(child_id)
 | |
| 
 | |
| 
 | |
| if __name__ == '__main__':
 | |
| #     content = 'test file content'
 | |
| #     duplicates = {'tests/2020/01/02/test.gz': [{'algo':'ssdeep', 'similarity':75}, {'algo':'tlsh', 'similarity':45}]}
 | |
| #
 | |
| #     item = Item('tests/2020/01/02/test_save.gz')
 | |
| #     item.create(content, _save=False)
 | |
|     filters = {'date_from': '20230101', 'date_to': '20230501', 'sources': ['crawled', 'submitted'], 'start': ':submitted/2023/04/28/submitted_2b3dd861-a75d-48e4-8cec-6108d41450da.gz'}
 | |
|     gen = get_all_items_objects(filters=filters)
 | |
|     for obj_id in gen:
 | |
|         print(obj_id.id)
 |