mirror of https://github.com/CIRCL/AIL-framework
933 lines
30 KiB
Python
Executable File
933 lines
30 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*-coding:UTF-8 -*
|
|
|
|
import base64
|
|
import gzip
|
|
import magic
|
|
import os
|
|
import re
|
|
import sys
|
|
import cld3
|
|
import html2text
|
|
|
|
from io import BytesIO
|
|
|
|
from pymisp import MISPObject
|
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
##################################
|
|
# Import Project packages
|
|
##################################
|
|
from lib.ail_core import get_ail_uuid
|
|
from lib.objects.abstract_object import AbstractObject
|
|
from lib.ConfigLoader import ConfigLoader
|
|
from lib import item_basic
|
|
from lib.data_retention_engine import update_obj_date, get_obj_date_first
|
|
from packages import Date
|
|
|
|
|
|
from flask import url_for
|
|
|
|
# Module-level configuration, loaded once at import time.
config_loader = ConfigLoader()

# # TODO: get and sanitize ITEMS DIRECTORY
# Resolved items directory; the final join with '' guarantees a trailing
# separator, which the prefix checks in this module rely on.
ITEMS_FOLDER = os.path.join(
    os.path.realpath(
        os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
    ),
    ''
)

# Database connections
r_cache = config_loader.get_redis_conn("Redis_Cache")
r_object = config_loader.get_db_conn("Kvrocks_Objects")

# Crawler file directories
screenshot_directory = config_loader.get_files_directory('screenshot')
har_directory = config_loader.get_files_directory('har')

# Base URL used to build item links outside of a flask context
baseurl = config_loader.get_config_str("Notifications", "ail_domain")

# The loader is only needed during import: drop the reference
config_loader = None
|
|
|
|
|
|
class Item(AbstractObject):
    """
    AIL Item Object. (strings)

    The item id is the path of the item file relative to ITEMS_FOLDER
    (see get_filename()). Helpers such as get_tags(), get_correlation(),
    get_duplicates(), add_duplicate(), is_tags_safe() and _delete() are not
    defined in this class; they are expected to come from AbstractObject.
    """

    def __init__(self, id):
        super().__init__('item', id)

    def exists(self):
        """Return the result of item_basic.exist_item() for this item id."""
        return item_basic.exist_item(self.id)

    def get_date(self, separator=False):
        """
        Returns Item date

        :param separator: forwarded to item_basic.get_item_date() as add_separator
        """
        return item_basic.get_item_date(self.id, add_separator=separator)

    def get_source(self):
        """
        Returns Item source/feeder name

        The source is made of every '/'-separated component of the id except
        the last four (presumably YYYY/MM/DD/<basename> -- TODO confirm).
        """
        # return self.id.split('/')[-5]
        l_source = self.id.split('/')[:-4]
        return os.path.join(*l_source)

    def get_basename(self):
        """Return the last path component of the item id."""
        return os.path.basename(self.id)

    def get_filename(self):
        """
        Return the absolute, symlink-resolved path of the item file.

        :return: the resolved path, or None when it is not located under ITEMS_FOLDER
        """
        # Creating the full filepath
        filename = os.path.realpath(os.path.join(ITEMS_FOLDER, self.id))

        # incorrect filename: the resolved path escaped ITEMS_FOLDER
        if os.path.commonprefix([filename, ITEMS_FOLDER]) != ITEMS_FOLDER:
            return None
        return filename

    def get_content(self, r_type='str'):
        """
        Returns Item content

        :param r_type: 'str' or 'bytes'; any other value returns None
        """
        if r_type == 'str':
            return item_basic.get_item_content(self.id)
        elif r_type == 'bytes':
            return item_basic.get_item_content_binary(self.id)
        return None

    def get_raw_content(self, decompress=False):
        """
        Return the item content wrapped in a BytesIO.

        :param decompress: when True use get_content(r_type='bytes'),
                           otherwise read the file on disk as is
        """
        if decompress:
            return BytesIO(self.get_content(r_type='bytes'))
        with open(self.get_filename(), 'rb') as f:
            return BytesIO(f.read())

    def get_gzip_content(self, b64=False):
        """
        Return the bytes of the item file as stored on disk.

        :param b64: when True, return the base64 encoding as a str
        :return: str if b64 is True, else the raw bytes. The raw bytes are
                 not decoded: a gzip stream is not valid UTF-8, so decoding
                 it could only fail.
        """
        with open(self.get_filename(), 'rb') as f:
            content = f.read()
        if b64:
            return base64.b64encode(content).decode()
        return content

    def get_html2text_content(self, content=None, ignore_links=False):
        """
        Convert the item content with html2text.

        :param content: content to convert; the item content is loaded when empty
        :param ignore_links: ignore links AND images in the conversion
        """
        if not content:
            content = self.get_content()
        h = html2text.HTML2Text()
        h.ignore_links = ignore_links
        h.ignore_images = ignore_links
        return h.handle(content)

    def get_size(self, r_str=False):
        """
        Return the size of the item file divided by 1024.

        :param r_str: when True the value is rounded to 2 decimals
        """
        size = os.path.getsize(self.get_filename()) / 1024.0
        if r_str:
            size = round(size, 2)
        return size

    def get_ail_2_ail_payload(self):
        """Return {'raw': <base64 of the item file>}."""
        payload = {'raw': self.get_gzip_content(b64=True)}
        return payload

    def get_parent(self):
        """Return the result of item_basic.get_item_parent() for this item."""
        return item_basic.get_item_parent(self.id)

    def set_parent(self, parent_id):
        """Register this item as a child of parent_id and store its parent."""
        r_object.sadd(f'child:item::{parent_id}', self.id)
        r_object.hset(f'meta:item::{self.id}', 'parent', parent_id)

    def add_children(self, child_id):
        """Register child_id as a child of this item and store its parent."""
        r_object.sadd(f'child:item::{self.id}', child_id)
        r_object.hset(f'meta:item::{child_id}', 'parent', self.id)

####################################################################################
####################################################################################

    def sanitize_id(self):
        pass

    # # TODO: sanitize_id
    # # TODO: check if already exists ?
    # # TODO: check if duplicate
    def save_on_disk(self, content, binary=True, compressed=False, b64=False):
        """
        Write the item content on disk.

        :param content: content to save
        :param binary: when False, content is a str and is encoded first
        :param b64: when True, content is base64 decoded
        :param compressed: when False, content is gzip compressed before writing
        :raises ValueError: if the item id does not resolve under ITEMS_FOLDER
        """
        if not binary:
            content = content.encode()
        if b64:
            content = base64.standard_b64decode(content)
        if not compressed:
            content = gzip.compress(content)

        filename = self.get_filename()
        if filename is None:
            raise ValueError(f'Invalid item id, outside of the items directory: {self.id}')
        # exist_ok avoids a crash if the directory is created concurrently
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as f:
            f.write(content)

    # # TODO:
    # correlations
    # content
    # tags
    # origin
    # duplicate -> all item iterations ???
    #
    def create(self, content, tags, father=None, duplicates=None, _save=True):
        """
        Create the item.

        :param content: binary, uncompressed content (see save_on_disk())
        :param tags: currently unused (see TODO below)
        :param father: currently unused
        :param duplicates: dict {obj_id: [{'algo': ..., 'similarity': ...}, ...]}
        :param _save: when True the content is written on disk
        """
        if duplicates is None:
            duplicates = {}
        if _save:
            # save_on_disk() names this parameter b64 (not base64)
            self.save_on_disk(content, binary=True, compressed=False, b64=False)

        # # TODO:
        # for tag in tags:
        #     self.add_tag(tag)

        if father:
            pass

        for obj_id in duplicates:
            for dup in duplicates[obj_id]:
                self.add_duplicate(obj_id, dup['algo'], dup['similarity'])

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
    def delete(self):
        """
        Call self._delete() then remove the item file.

        :return: True if the file was removed, False if it did not exist
        """
        self._delete()
        try:
            os.remove(self.get_filename())
            return True
        except FileNotFoundError:
            return False

####################################################################################
####################################################################################

    def get_link(self, flask_context=False):
        """
        Return the URL of the item page.

        :param flask_context: use flask url_for() when True, else build the
                              URL from the configured base URL
        """
        if flask_context:
            url = url_for('objects_item.showItem', id=self.id)
        else:
            url = f'{baseurl}/object/item?id={self.id}'
        return url

    def get_svg_icon(self):
        """Return the icon description dict; crawled items are drawn in red."""
        # module-level is_crawled(), not the method of this class
        if is_crawled(self.id):
            color = 'red'
        else:
            color = '#332288'
        return {'style': '', 'icon': '', 'color': color, 'radius': 5}

    def get_misp_object(self):
        """Build an 'ail-leak' MISPObject for this item, tagging each attribute."""
        obj_date = self.get_date()
        obj = MISPObject('ail-leak', standalone=True)
        obj.first_seen = obj_date

        obj_attrs = [obj.add_attribute('first-seen', value=obj_date),
                     obj.add_attribute('raw-data', value=self.id, data=self.get_raw_content()),
                     obj.add_attribute('sensor', value=get_ail_uuid())]
        # tags are fetched once and applied to every attribute
        tags = self.get_tags()
        for obj_attr in obj_attrs:
            for tag in tags:
                obj_attr.add_tag(tag)
        return obj

    def exist_correlation(self):
        pass

    def is_crawled(self):
        """Return True if the item id starts with 'crawled'."""
        return self.id.startswith('crawled')

    # if is_crawled
    def get_domain(self):
        """
        Return the id without its first 19 and last 36 characters.

        NOTE(review): presumably ids look like
        'crawled/YYYY/MM/DD/<domain><36-char suffix>' -- confirm.
        """
        return self.id[19:-36]

    def get_screenshot(self):
        """
        Return the relative path of one correlated screenshot.

        The first character of the correlated id is dropped and the first 12
        remaining characters are split in 2-character directories.

        :return: the relative path, or None when there is no screenshot
        """
        s = self.get_correlation('screenshot')
        if s.get('screenshot'):
            s = s['screenshot'].pop()[1:]
            return os.path.join(s[0:2], s[2:4], s[4:6], s[6:8], s[8:10], s[10:12], s[12:])
        return None

    def get_har(self):
        """Return the path of the item HAR file (.json), or None if it does not exist."""
        har_path = os.path.join(har_directory, self.id) + '.json'
        if os.path.isfile(har_path):
            return har_path
        else:
            return None

    def get_url(self):
        """Return the 'url' field of the item metadata hash."""
        return r_object.hget(f'meta:item::{self.id}', 'url')

    def set_crawled(self, url, parent_id):
        """Store the crawled url and set the item parent."""
        r_object.hset(f'meta:item::{self.id}', 'url', url)
        self.set_parent(parent_id)

    # options: set of optional meta fields
    def get_meta(self, options=None):
        """
        Return the item metadata: id, date, source and tags, plus the
        requested optional fields ('content', 'crawler', 'duplicates',
        'lines', 'parent', 'size', 'mimetype').

        :type options: set
        """
        if options is None:
            options = set()
        meta = {'id': self.id,
                'date': self.get_date(separator=True),
                'source': self.get_source(),
                'tags': self.get_tags(r_list=True)}
        # optional meta fields
        if 'content' in options:
            meta['content'] = self.get_content()
        if 'crawler' in options:
            if self.is_crawled():
                tags = meta.get('tags')
                meta['crawler'] = self.get_meta_crawler(tags=tags)
        if 'duplicates' in options:
            meta['duplicates'] = self.get_duplicates()
        if 'lines' in options:
            # reuse the content when it was already loaded
            content = meta.get('content')
            meta['lines'] = self.get_meta_lines(content=content)
        if 'parent' in options:
            meta['parent'] = self.get_parent()
        if 'size' in options:
            meta['size'] = self.get_size(r_str=True)
        if 'mimetype' in options:
            content = meta.get('content')
            meta['mimetype'] = self.get_mimetype(content=content)

        # meta['encoding'] = None
        return meta

    def get_meta_crawler(self, tags=None):
        """
        Return the crawler metadata (domain, har, screenshot, url,
        is_tags_safe); an empty dict if the item is not crawled.

        :param tags: item tags; loaded with get_tags() when empty
        :type tags: list
        """
        if tags is None:
            tags = []
        crawler = {}
        if self.is_crawled():
            crawler['domain'] = self.get_domain()
            crawler['har'] = self.get_har()
            crawler['screenshot'] = self.get_screenshot()
            crawler['url'] = self.get_url()
            if not tags:
                tags = self.get_tags()
            crawler['is_tags_safe'] = self.is_tags_safe(tags)
        return crawler

    def get_meta_lines(self, content=None):
        """
        Return {'nb': number of lines, 'max_length': length of the longest line}.

        :param content: content to inspect; the item content is loaded when empty
        """
        if not content:
            content = self.get_content()
        max_length = 0
        nb_line = 0
        for line in content.splitlines():
            length = len(line)
            if length > max_length:
                max_length = length
            nb_line += 1
        return {'nb': nb_line, 'max_length': max_length}

    def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
        """
        Return the languages detected by cld3 in the cleaned item content.

        :param min_len: minimum length of the cleaned content to run the detection
        :param num_langs: forwarded to cld3.get_frequent_languages()
        :param min_proportion: minimum proportion of a kept language
        :param min_probability: minimum probability of a kept language
        :return: list of reliable cld3 predictions
        """
        all_languages = []
        ## CLEAN CONTENT ##
        content = self.get_html2text_content(ignore_links=True)
        content = remove_all_urls_from_content(self.id, item_content=content)
        # REMOVE USELESS SPACE
        content = ' '.join(content.split())
        #- CLEAN CONTENT -#
        if len(content) >= min_len:  # # TODO: # FIXME: check num langs limit
            for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
                if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
                    all_languages.append(lang)
        return all_languages

    def get_mimetype(self, content=None):
        """
        Return the mimetype guessed by magic.from_buffer().

        :param content: content to inspect; the item content is loaded when empty
        """
        if not content:
            content = self.get_content()
        return magic.from_buffer(content, mime=True)
|
|
|
|
############################################################################
|
|
############################################################################
|
|
|
|
def _get_dir_source_name(directory, source_name=None, l_sources_name=None, filter_dir=False):
    """
    Recursively collect the names of the sources found under a directory.

    A directory is recorded as a source when it is empty, or as soon as one
    of its entries is a 4-digit name (the scan of that directory then stops).
    Other sub-directories are scanned recursively; plain files are ignored.

    :param directory: root directory to scan
    :param source_name: path, relative to directory, of the directory being
                        scanned (None for the root)
    :param l_sources_name: accumulator shared by the recursive calls
    :type l_sources_name: set
    :param filter_dir: when True, remove 'archive/' and 'alerts/' from the
                       name of sources found through a 4-digit entry
    :return: the set of source names
    :rtype: set
    """
    # 'is None' and not truthiness: an empty accumulator must still be shared
    if l_sources_name is None:
        l_sources_name = set()
    if source_name:
        l_dir = os.listdir(os.path.join(directory, source_name))
    else:
        l_dir = os.listdir(directory)

    # empty directory
    if not l_dir:
        # the root itself has no name to record
        if source_name:
            l_sources_name.add(source_name)
        # set.add() returns None: always return the accumulator itself
        return l_sources_name

    for src_name in l_dir:
        # 4-digit entry: the current directory is a source
        if len(src_name) == 4 and src_name.isdecimal():
            if source_name:
                to_add = source_name
                # filter sources, remove first directory
                if filter_dir:
                    to_add = to_add.replace('archive/', '').replace('alerts/', '')
                l_sources_name.add(to_add)
            return l_sources_name
        if source_name:
            src_name = os.path.join(source_name, src_name)
        # only directories can contain sources
        if not os.path.isdir(os.path.join(directory, src_name)):
            continue
        _get_dir_source_name(directory, source_name=src_name, l_sources_name=l_sources_name,
                             filter_dir=filter_dir)
    return l_sources_name
|
|
|
|
def get_items_sources(filter_dir=False, r_list=False):
    """
    Return the sources found by scanning ITEMS_FOLDER.

    :param filter_dir: forwarded to _get_dir_source_name()
    :param r_list: when True, convert the result to a list
    :return: the sources, or an empty list when nothing was found
    """
    sources = _get_dir_source_name(ITEMS_FOLDER, filter_dir=filter_dir)
    if not sources:
        return []
    return list(sources) if r_list else sources
|
|
|
|
def get_items_by_source(source):
    """
    Return the id of every file found under the directory of a source.

    :param source: source name, relative to ITEMS_FOLDER
    :return: list of item ids (file paths with the first ITEMS_FOLDER occurrence removed)
    """
    source_dir = os.path.join(os.environ['AIL_HOME'], ITEMS_FOLDER, source)
    return [os.path.join(root, filename).replace(ITEMS_FOLDER, '', 1)
            for root, _dirs, filenames in os.walk(source_dir)
            for filename in filenames]
|
|
|
|
def _manual_set_items_date_first_last():
    """
    Scan the first-level directories of every source and push the lowest and
    highest directory name, read as an integer, to update_obj_date().

    Values are bounded by the sentinels 9999 (first) and 0 (last); no update
    is sent for a bound still equal to its sentinel.
    """
    found = []
    for source in get_items_sources():
        source_dir = os.path.join(os.environ['AIL_HOME'], ITEMS_FOLDER, source)
        found.extend(int(entry)
                     for entry in os.listdir(source_dir)
                     if os.path.isdir(os.path.join(source_dir, entry)))

    # sentinels take part in the comparison, as in a running min/max
    first = min(found + [9999])
    last = max(found + [0])

    if first != 9999:
        update_obj_date(first, 'item')
    if last != 0:
        update_obj_date(last, 'item')
|
|
|
|
################################################################################
|
|
################################################################################
|
|
################################################################################
|
|
|
|
def get_nb_items_objects(filters={}):
    """
    Count the entries of the date directories selected by the filters.

    :param filters: optional keys 'date_from', 'date_to' and 'sources';
                    a missing date_from defaults to get_obj_date_first('item'),
                    a missing date_to to today, missing sources to all sources
    :return: number of directory entries found
    """
    date_from = filters.get('date_from')
    date_to = filters.get('date_to')
    sources = sorted(filters['sources'] if 'sources' in filters else get_all_sources())

    # date
    if not date_from:
        date_from = get_obj_date_first('item')
    if not date_to:
        date_to = Date.get_today_date_str()
    daterange = Date.get_daterange(date_from, date_to)

    total = 0
    for source in sources:
        for day in daterange:
            day_dir = os.path.join(ITEMS_FOLDER, source, f'{day[0:4]}/{day[4:6]}/{day[6:8]}')
            if os.path.isdir(day_dir):
                total += len(os.listdir(day_dir))
    return total
|
|
|
|
def get_all_items_objects(filters={}):
    """
    Generator yielding an Item for every file of the selected sources/dates.

    :param filters: optional keys:
        - 'date_from' / 'date_to': same defaults as get_nb_items_objects()
        - 'sources': list of sources (default: all sources)
        - 'start': '<prefix>:<item id>'; iteration resumes after this item.
          Sources sorted before the item source and dates before the item
          date are skipped.
    """
    date_from = filters.get('date_from')
    date_to = filters.get('date_to')
    sources = sorted(filters['sources'] if 'sources' in filters else get_all_sources())

    start_id = None
    start_date = None
    if filters.get('start'):
        _, start_id = filters['start'].split(':', 1)
        start_item = Item(start_id)
        # remove sources
        start_source = start_item.get_source()
        if start_source:
            for pos, name in enumerate(sources):
                if name == start_source:
                    sources = sources[pos:]
                    break
        start_date = start_item.get_date()

    # date
    if not date_from:
        date_from = get_obj_date_first('item')
    if not date_to:
        date_to = Date.get_today_date_str()
    daterange = Date.get_daterange(date_from, date_to)

    # drop the days before the start item
    if start_date and int(start_date) > int(date_from):
        for pos, day in enumerate(daterange):
            if day == start_date:
                daterange = daterange[pos:]
                break

    for source in sources:
        for day in daterange:
            day_path = f'{day[0:4]}/{day[4:6]}/{day[6:8]}'
            full_dir = os.path.join(ITEMS_FOLDER, source, day_path)
            s_dir = os.path.join(source, day_path)
            if not os.path.isdir(full_dir):
                continue

            # TODO replace by os.scandir() ????
            all_items = sorted(os.path.join(s_dir, entry)
                               for entry in os.listdir(full_dir)
                               if os.path.isfile(os.path.join(full_dir, entry)))
            # start obj id: skip everything up to and including the start
            # item, then stop looking for it
            if start_id:
                for pos, candidate in enumerate(all_items):
                    if candidate == start_id:
                        all_items = all_items[pos + 1:]
                        start_id = None
                        break
            for obj_id in all_items:
                yield Item(obj_id)
|
|
|
|
################################################################################
|
|
################################################################################
|
|
################################################################################
|
|
|
|
#### API ####
|
|
|
|
def api_get_item(data):
    """
    API: return the metadata of an item.

    :param data: dict with the mandatory key 'id' and optional truthy flags
                 'content', 'crawler', 'duplicates', 'lines', 'mimetype',
                 'parent', 'size' selecting extra meta fields
    :return: (response dict, HTTP status code)
    """
    item_id = data.get('id', None)
    if not item_id:
        return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
    item = Item(item_id)
    if not item.exists():
        return {'status': 'error', 'reason': 'Item not found'}, 404

    optional_fields = ('content', 'crawler', 'duplicates', 'lines', 'mimetype', 'parent', 'size')
    options = {field for field in optional_fields if data.get(field)}

    # TODO correlation

    return item.get_meta(options=options), 200
|
|
|
|
|
|
# -- API -- #
|
|
|
|
################################################################################
|
|
################################################################################
|
|
################################################################################
|
|
|
|
# TODO
|
|
|
|
def exist_item(item_id):
    """Return the result of item_basic.exist_item() for item_id."""
    found = item_basic.exist_item(item_id)
    return found
|
|
|
|
def get_basename(item_id):
    """Return the last path component of item_id."""
    _head, tail = os.path.split(item_id)
    return tail
|
|
|
|
def get_item_id(full_path):
    """Return full_path with the first occurrence of ITEMS_FOLDER removed."""
    item_id = full_path.replace(ITEMS_FOLDER, '', 1)
    return item_id
|
|
|
|
def get_item_filepath(item_id):
    """Return the result of item_basic.get_item_filepath() for item_id."""
    filepath = item_basic.get_item_filepath(item_id)
    return filepath
|
|
|
|
def get_item_date(item_id, add_separator=False):
    """
    Return the result of item_basic.get_item_date() for item_id.

    :param add_separator: forwarded unchanged to item_basic.get_item_date()
    """
    item_date = item_basic.get_item_date(item_id, add_separator=add_separator)
    return item_date
|
|
|
|
def get_source(item_id):
    """Return the result of item_basic.get_source() for item_id."""
    source = item_basic.get_source(item_id)
    return source
|
|
|
|
def get_all_sources():
    """Return the result of item_basic.get_all_items_sources(r_list=True)."""
    sources = item_basic.get_all_items_sources(r_list=True)
    return sources
|
|
|
|
def get_item_basename(item_id):
    """Return the last path component of item_id."""
    _head, tail = os.path.split(item_id)
    return tail
|
|
|
|
def get_item_size(item_id):
    """Return the size of the item file divided by 1024, rounded to 2 decimals."""
    item_path = os.path.join(ITEMS_FOLDER, item_id)
    size = os.path.getsize(item_path) / 1024.0
    return round(size, 2)
|
|
|
|
def get_item_encoding(item_id):
    """Placeholder: no encoding detection is implemented, always returns None."""
    encoding = None
    return encoding
|
|
|
|
def get_lines_info(item_id, item_content=None):
    """
    Return the number of lines of a content and the length of its longest line.

    :param item_id: item id, only used to load the content when none is given
    :param item_content: content to inspect; loaded with get_item_content() when empty
    :return: {'nb': <number of lines>, 'max_length': <longest line length>}
    """
    if not item_content:
        item_content = get_item_content(item_id)
    lines = item_content.splitlines()
    longest = max(map(len, lines), default=0)
    return {'nb': len(lines), 'max_length': longest}
|
|
|
|
|
|
def get_item_metadata(item_id, item_content=None):
    """
    Return the date, source, size, encoding and lines info of an item.

    :param item_content: optional content, forwarded to get_lines_info()
    """
    ## TODO: FIXME ##performance
    # encoding
    # language
    # lines info
    item_metadata = {}
    item_metadata['date'] = get_item_date(item_id, add_separator=True)
    item_metadata['source'] = get_source(item_id)
    item_metadata['size'] = get_item_size(item_id)
    item_metadata['encoding'] = get_item_encoding(item_id)
    item_metadata['lines'] = get_lines_info(item_id, item_content=item_content)
    return item_metadata
|
|
|
|
def get_item_content(item_id):
    """Return the result of item_basic.get_item_content() for item_id."""
    content = item_basic.get_item_content(item_id)
    return content
|
|
|
|
def get_item_content_html2text(item_id, item_content=None, ignore_links=False):
    """
    Convert an item content with html2text.

    :param item_content: content to convert; loaded with get_item_content() when empty
    :param ignore_links: ignore links AND images in the conversion
    """
    content = item_content if item_content else get_item_content(item_id)
    converter = html2text.HTML2Text()
    converter.ignore_links = ignore_links
    converter.ignore_images = ignore_links
    return converter.handle(content)
|
|
|
|
def remove_all_urls_from_content(item_id, item_content=None):
    """
    Remove the URLs and the PGP public key / signature / message blocks
    from a content.

    :param item_id: item id, only used to load the content when none is given
    :param item_content: content to clean; loaded with get_item_content() when empty
    :return: the cleaned content
    """
    text = item_content if item_content else get_item_content(item_id)

    url_pattern = re.compile(r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b')
    # longest matches are removed first
    for found_url in sorted(url_pattern.findall(text), key=len, reverse=True):
        text = text.replace(found_url, '')

    pgp_patterns = (
        r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----',
        r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----',
        r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----',
    )
    for pgp_pattern in pgp_patterns:
        # matches are searched in the content as cleaned so far
        for pgp_block in re.findall(pgp_pattern, text):
            text = text.replace(pgp_block, '')

    return text
|
|
|
|
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
    """
    Return the languages detected by cld3 in the cleaned content of an item.

    :param min_len: minimum length of the cleaned content to run the detection
    :param num_langs: forwarded to cld3.get_frequent_languages()
    :param min_proportion: minimum proportion of a kept language
    :param min_probability: minimum probability of a kept language
    :return: list of reliable cld3 predictions (empty if the content is too short)
    """
    ## CLEAN CONTENT ##
    content = get_item_content_html2text(item_id, ignore_links=True)
    content = remove_all_urls_from_content(item_id, item_content=content)

    # REMOVE USELESS SPACE
    content = ' '.join(content.split())
    #- CLEAN CONTENT -#

    if len(content) < min_len:
        return []
    return [lang
            for lang in cld3.get_frequent_languages(content, num_langs=num_langs)
            if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable]
|
|
|
|
# API
|
|
# def get_item(request_dict):
|
|
# if not request_dict:
|
|
# return {'status': 'error', 'reason': 'Malformed JSON'}, 400
|
|
#
|
|
# item_id = request_dict.get('id', None)
|
|
# if not item_id:
|
|
# return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
|
|
# if not exist_item(item_id):
|
|
# return {'status': 'error', 'reason': 'Item not found'}, 404
|
|
#
|
|
# dict_item = {}
|
|
# dict_item['id'] = item_id
|
|
# date = request_dict.get('date', True)
|
|
# if date:
|
|
# add_separator = False
|
|
# if request_dict.get('date_separator', False):
|
|
# add_separator = True
|
|
# dict_item['date'] = get_item_date(item_id, add_separator=add_separator)
|
|
# tags = request_dict.get('tags', True)
|
|
# if tags:
|
|
# dict_item['tags'] = Tag.get_object_tags('item', item_id)
|
|
#
|
|
# size = request_dict.get('size', False)
|
|
# if size:
|
|
# dict_item['size'] = get_item_size(item_id)
|
|
#
|
|
# content = request_dict.get('content', False)
|
|
# if content:
|
|
# # UTF-8 outpout, # TODO: use base64
|
|
# dict_item['content'] = get_item_content(item_id)
|
|
#
|
|
# raw_content = request_dict.get('raw_content', False)
|
|
# if raw_content:
|
|
# dict_item['raw_content'] = get_raw_content(item_id)
|
|
#
|
|
# lines_info = request_dict.get('lines', False)
|
|
# if lines_info:
|
|
# dict_item['lines'] = get_lines_info(item_id, dict_item.get('content', 'None'))
|
|
#
|
|
# if request_dict.get('pgp'):
|
|
# dict_item['pgp'] = {}
|
|
# if request_dict['pgp'].get('key'):
|
|
# dict_item['pgp']['key'] = get_item_pgp_key(item_id)
|
|
# if request_dict['pgp'].get('mail'):
|
|
# dict_item['pgp']['mail'] = get_item_pgp_mail(item_id)
|
|
# if request_dict['pgp'].get('name'):
|
|
# dict_item['pgp']['name'] = get_item_pgp_name(item_id)
|
|
#
|
|
# if request_dict.get('cryptocurrency'):
|
|
# dict_item['cryptocurrency'] = {}
|
|
# if request_dict['cryptocurrency'].get('bitcoin'):
|
|
# dict_item['cryptocurrency']['bitcoin'] = get_item_bitcoin(item_id)
|
|
#
|
|
# return dict_item, 200
|
|
|
|
|
|
|
|
def api_get_item_content_base64_utf8(request_dict):
    """
    API: return the content of an item, UTF-8 encoded then base64 encoded.

    :param request_dict: request body, a dict with the mandatory key 'id'
    :return: (response dict, HTTP status code)
    """
    # validate the body before reading it: a missing body (None) must yield
    # a 400 response, not an AttributeError
    if not request_dict:
        return {'status': 'error', 'reason': 'Malformed JSON'}, 400
    item_id = request_dict.get('id', None)
    if not item_id:
        return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
    if not exist_item(item_id):
        return {'status': 'error', 'reason': 'Item not found'}, 404

    item_content = get_item_content(item_id)
    item_content = base64.b64encode(item_content.encode('utf-8')).decode('UTF-8')
    return {'status': 'success', 'content': item_content}, 200
|
|
|
|
|
|
def api_get_items_sources():
    """API: return ({'sources': <all sources>}, 200)."""
    return {'sources': get_all_sources()}, 200
|
|
|
|
# def check_item_source(request_dict):
|
|
# source = request_dict.get('source', None)
|
|
# if not request_dict:
|
|
# return {'status': 'error', 'reason': 'Malformed JSON'}, 400
|
|
# if not source:
|
|
# return {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400
|
|
#
|
|
# all_sources = item_basic.get_all_items_sources()
|
|
#
|
|
# if source not in all_sources:
|
|
# return {'status': 'error', 'reason': 'Invalid source', 'provide': source}, 400
|
|
# return {'status': 'success', 'reason': 'Valid source', 'provide': source}, 200
|
|
|
|
|
|
###
|
|
### GET Internal Module DESC
|
|
###
|
|
def get_item_list_desc(list_item_id):
    """
    Return a short description (id, date, tags) of every item id given.

    :param list_item_id: iterable of item ids
    :return: list of dicts {'id', 'date', 'tags'}
    """
    descriptions = []
    for item_id in list_item_id:
        item = Item(item_id)
        description = {'id': item_id}
        description['date'] = get_item_date(item_id)
        description['tags'] = item.get_tags(r_list=True)
        descriptions.append(description)
    return descriptions
|
|
|
|
def is_crawled(item_id):
    """Return the result of item_basic.is_crawled() for item_id."""
    crawled = item_basic.is_crawled(item_id)
    return crawled
|
|
|
|
def is_onion(item_id):
    """
    Return True if item_id is a crawled item of a .onion domain.

    The id must be longer than 62 characters and the 6 characters located
    before its last 36 characters must be '.onion'.

    :param item_id: item id
    :rtype: bool
    """
    # the length check applies to the id (the previous code measured the
    # boolean result variable, which always raised a TypeError)
    if len(item_id) <= 62:
        return False
    # cheap string comparison first, is_crawled() only when it matches
    return item_id[-42:-36] == '.onion' and bool(is_crawled(item_id))
|
|
|
|
def is_item_in_domain(domain, item_id):
    """
    Return True if the characters located just before the last 36
    characters of item_id are equal to domain.

    The id must be longer than len(domain) + 48 characters.
    """
    domain_length = len(domain)
    if len(item_id) <= domain_length + 48:
        return False
    return item_id[-36 - domain_length:-36] == domain
|
|
|
|
def get_item_domain(item_id):
    """Return the result of item_basic.get_item_domain() for item_id."""
    domain = item_basic.get_item_domain(item_id)
    return domain
|
|
|
|
def get_domain(item_id):
    """Return the last '/'-separated component of item_id without its last 36 characters."""
    basename = item_id.rsplit('/', 1)[-1]
    return basename[:-36]
|
|
|
|
# TODO MOVE ME
|
|
def get_item_har_name(item_id):
    """Return the path of the HAR file (.json) of an item, or None if it does not exist."""
    har_path = os.path.join(har_directory, item_id) + '.json'
    return har_path if os.path.isfile(har_path) else None
|
|
|
|
def get_item_filename(item_id):
    """
    Return the absolute, symlink-resolved path of an item file.

    :return: the resolved path, or None when it is not located under ITEMS_FOLDER
    """
    # Creating the full filepath
    resolved = os.path.realpath(os.path.join(ITEMS_FOLDER, item_id))

    # incorrect filename
    if os.path.commonprefix([resolved, ITEMS_FOLDER]) == ITEMS_FOLDER:
        return resolved
    return None
|
|
|
|
def get_raw_content(item_id):
    """Return the bytes of the item file, as stored on disk, wrapped in a BytesIO."""
    with open(get_item_filepath(item_id), 'rb') as item_file:
        raw = item_file.read()
    return BytesIO(raw)
|
|
|
|
def save_raw_content(item_id, io_content):
    """
    Write the raw content of a new item on disk.

    :param item_id: item id
    :param io_content: object exposing getvalue() (e.g. io.BytesIO)
    :return: True if the file was written; False if the id does not resolve
             to a path under ITEMS_FOLDER or if the file already exists
    :rtype: bool
    """
    filepath = get_item_filename(item_id)
    # get_item_filename() returns None for an id outside of ITEMS_FOLDER
    if filepath is None:
        return False
    if os.path.isfile(filepath):
        #print('File already exist')
        return False
    # create subdir (exist_ok: no crash if it is created concurrently)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # # TODO: check if is IO file
    with open(filepath, 'wb') as f:
        f.write(io_content.getvalue())
    return True
|
|
|
|
# IDEA: send item to duplicate ?
|
|
def create_item(obj_id, obj_metadata, io_content):
    '''
    Create a new Item (Import or Test only).

    :param obj_id: item id
    :type obj_metadata: dict - 'first_seen', 'tags'
    :param io_content: item content, forwarded to save_raw_content()

    :return: is item created
    :rtype: boolean
    '''
    # check if datetime match ??

    # # TODO: validate obj_id

    if not save_raw_content(obj_id, io_content):
        # Item not created
        return False

    # item saved: create tags
    if 'tags' in obj_metadata:
        item = Item(obj_id)
        # # TODO: handle mixed tags: taxonomies and Galaxies
        # for tag in obj_metadata['tags']:
        #     item.add_tag(tag)
    return True
|
|
|
|
# # check if item exists
|
|
# if not exist_item(obj_id):
|
|
# return False
|
|
# else:
|
|
# delete_item_duplicate(obj_id)
|
|
# # delete MISP event
|
|
# r_s_metadata.delete('misp_events:{}'.format(obj_id))
|
|
# r_s_metadata.delete('hive_cases:{}'.format(obj_id))
|
|
#
|
|
# os.remove(get_item_filename(obj_id))
|
|
#
|
|
# # get all correlation
|
|
# obj_correlations = get_item_all_correlation(obj_id)
|
|
# for correlation in obj_correlations:
|
|
# if correlation=='cryptocurrency' or correlation=='pgp':
|
|
# for obj2_subtype in obj_correlations[correlation]:
|
|
# for obj2_id in obj_correlations[correlation][obj2_subtype]:
|
|
# Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id,
|
|
# obj1_subtype=obj2_subtype)
|
|
# else:
|
|
# for obj2_id in obj_correlations[correlation]:
|
|
# Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id)
|
|
#
|
|
# # delete father/child
|
|
# delete_node(obj_id)
|
|
#
|
|
# # delete item metadata
|
|
#
|
|
# return True
|
|
#
|
|
# ### TODO in inport V2
|
|
# # delete from tracked items
|
|
#
|
|
# # # # TODO: # FIXME: LATER
|
|
# # delete from queue
|
|
# ###
|
|
# return False
|
|
|
|
#### ####
|
|
# def delete_node(item_id):
|
|
# if is_node(item_id):
|
|
# if is_crawled(item_id):
|
|
# delete_domain_node(item_id)
|
|
# item_basic._delete_node(item_id)
|
|
#
|
|
# def delete_domain_node(item_id):
|
|
# if is_domain_root(item_id):
|
|
# # remove from domain history
|
|
# domain, port = get_item_domain_with_port(item_id).split(':')
|
|
# domain_basic.delete_domain_item_core(item_id, domain, port)
|
|
# for child_id in get_all_domain_node_by_item_id(item_id):
|
|
# delete_item(child_id)
|
|
|
|
|
|
if __name__ == '__main__':
    # content = 'test file content'
    # duplicates = {'tests/2020/01/02/test.gz': [{'algo':'ssdeep', 'similarity':75}, {'algo':'tlsh', 'similarity':45}]}
    #
    # item = Item('tests/2020/01/02/test_save.gz')
    # item.create(content, _save=False)

    # Manual check: print the id of every item selected by the filters
    filters = {'date_from': '20230101', 'date_to': '20230501', 'sources': ['crawled', 'submitted'], 'start': ':submitted/2023/04/28/submitted_2b3dd861-a75d-48e4-8cec-6108d41450da.gz'}
    for item in get_all_items_objects(filters=filters):
        print(item.id)
|