AIL-framework/bin/lib/objects/Messages.py

328 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import re
import sys
import cld3
import html2text
from datetime import datetime
from pymisp import MISPObject
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ail_core import get_ail_uuid
from lib.objects.abstract_object import AbstractObject
from lib.ConfigLoader import ConfigLoader
from lib.objects import UsersAccount
from lib.data_retention_engine import update_obj_date, get_obj_date_first
# TODO Set all messages ???
from flask import url_for
config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
r_object = config_loader.get_db_conn("Kvrocks_Objects")
# r_content = config_loader.get_db_conn("Kvrocks_Content")
baseurl = config_loader.get_config_str("Notifications", "ail_domain")
config_loader = None
# TODO SAVE OR EXTRACT MESSAGE SOURCE FOR ICON ?????????
# TODO iterate on all objects
# TODO also add support for small objects ????
# CAN Message exists without CHAT -> no convert it to object
# ID: source:chat_id:message_id ????
#
# /!\ handle null chat and message id -> chat = uuid and message = timestamp ???
# ID = <ChatInstance UUID>/<timestamp>/<chat ID>/<message ID> => telegram without channels
# ID = <ChatInstance UUID>/<timestamp>/<chat ID>/<Channel ID>/<message ID>
# ID = <ChatInstance UUID>/<timestamp>/<chat ID>/<Thread ID>/<message ID>
# ID = <ChatInstance UUID>/<timestamp>/<chat ID>/<Channel ID>/<Thread ID>/<message ID>
class Message(AbstractObject):
"""
AIL Message Object. (strings)
"""
def __init__(self, id): # TODO subtype or use source ????
super(Message, self).__init__('message', id) # message::< telegram/1692189934.380827/ChatID_MessageID >
def exists(self):
if self.subtype is None:
return r_object.exists(f'meta:{self.type}:{self.id}')
else:
return r_object.exists(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}')
def get_source(self):
"""
Returns source/feeder name
"""
l_source = self.id.split('/')[:-2]
return os.path.join(*l_source)
def get_basename(self):
return os.path.basename(self.id)
def get_content(self, r_type='str'): # TODO ADD cache # TODO Compress content ???????
"""
Returns content
"""
content = self._get_field('content')
if r_type == 'str':
return content
elif r_type == 'bytes':
return content.encode()
def get_date(self):
timestamp = self.get_timestamp()
return datetime.fromtimestamp(float(timestamp)).strftime('%Y%m%d')
def get_timestamp(self):
dirs = self.id.split('/')
return dirs[1]
def get_message_id(self): # TODO optimize
message_id = self.get_basename().rsplit('/', 1)[1]
# if message_id.endswith('.gz'):
# message_id = message_id[:-3]
return message_id
def get_chat_id(self): # TODO optimize -> use me to tag Chat
chat_id = self.get_basename().rsplit('_', 1)[0]
return chat_id
# TODO get Instance ID
# TODO get channel ID
# TODO get thread ID
def get_images(self):
images = []
for child in self.get_childrens():
obj_type, _, obj_id = child.split(':', 2)
if obj_type == 'image':
images.append(obj_id)
return images
def get_user_account(self, meta=False):
user_account = self.get_correlation('user-account')
if user_account.get('user-account'):
user_account = f'user-account:{user_account["user-account"].pop()}'
if meta:
_, user_account_subtype, user_account_id = user_account.split(':', 3)
user_account = UsersAccount.UserAccount(user_account_id, user_account_subtype).get_meta(options={'username', 'username_meta'})
return user_account
# Update value on import
# reply to -> parent ?
# reply/comment - > children ?
# nb views
# reactions
# nb fowards
# room ???
# message from channel ???
# message media
def get_translation(self): # TODO support multiple translated languages ?????
"""
Returns translated content
"""
return self._get_field('translated') # TODO multiples translation ... -> use set
def _set_translation(self, translation):
"""
Set translated content
"""
return self._set_field('translated', translation) # translation by hash ??? -> avoid translating multiple time
def get_html2text_content(self, content=None, ignore_links=False):
if not content:
content = self.get_content()
h = html2text.HTML2Text()
h.ignore_links = ignore_links
h.ignore_images = ignore_links
return h.handle(content)
# def get_ail_2_ail_payload(self):
# payload = {'raw': self.get_gzip_content(b64=True)}
# return payload
def get_link(self, flask_context=False):
if flask_context:
url = url_for('chats_explorer.objects_message', type=self.type, id=self.id)
else:
url = f'{baseurl}/objects/message?id={self.id}'
return url
def get_svg_icon(self):
return {'style': 'fas', 'icon': '\uf4ad', 'color': '#4dffff', 'radius': 5}
def get_misp_object(self): # TODO
obj = MISPObject('instant-message', standalone=True)
obj_date = self.get_date()
if obj_date:
obj.first_seen = obj_date
else:
self.logger.warning(
f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={obj_date}')
# obj_attrs = [obj.add_attribute('first-seen', value=obj_date),
# obj.add_attribute('raw-data', value=self.id, data=self.get_raw_content()),
# obj.add_attribute('sensor', value=get_ail_uuid())]
obj_attrs = []
for obj_attr in obj_attrs:
for tag in self.get_tags():
obj_attr.add_tag(tag)
return obj
# def get_url(self):
# return r_object.hget(f'meta:item::{self.id}', 'url')
# options: set of optional meta fields
def get_meta(self, options=None, timestamp=None):
"""
:type options: set
:type timestamp: float
"""
if options is None:
options = set()
meta = self.get_default_meta(tags=True)
# timestamp
if not timestamp:
timestamp = self.get_timestamp()
else:
timestamp = float(timestamp)
timestamp = datetime.fromtimestamp(float(timestamp))
meta['date'] = timestamp.strftime('%Y/%m/%d')
meta['hour'] = timestamp.strftime('%H:%M:%S')
meta['full_date'] = timestamp.isoformat(' ')
meta['source'] = self.get_source()
# optional meta fields
if 'content' in options:
meta['content'] = self.get_content()
if 'parent' in options:
meta['parent'] = self.get_parent()
if meta['parent'] and 'parent_meta' in options:
options.remove('parent')
parent_type, _, parent_id = meta['parent'].split(':', 3)
if parent_type == 'message':
message = Message(parent_id)
meta['reply_to'] = message.get_meta(options=options)
if 'investigations' in options:
meta['investigations'] = self.get_investigations()
if 'link' in options:
meta['link'] = self.get_link(flask_context=True)
if 'icon' in options:
meta['icon'] = self.get_svg_icon()
if 'user-account' in options:
meta['user-account'] = self.get_user_account(meta=True)
if not meta['user-account']:
meta['user-account'] = {'id': 'UNKNOWN'}
if 'chat' in options:
meta['chat'] = self.get_chat_id()
if 'images' in options:
meta['images'] = self.get_images()
# meta['encoding'] = None
return meta
def _languages_cleaner(self, content=None):
if not content:
content = self.get_content()
# REMOVE URLS
regex = r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b'
url_regex = re.compile(regex)
urls = url_regex.findall(content)
urls = sorted(urls, key=len, reverse=True)
for url in urls:
content = content.replace(url, '')
# REMOVE PGP Blocks
regex_pgp_public_blocs = r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----'
regex_pgp_signature = r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----'
regex_pgp_message = r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----'
re.compile(regex_pgp_public_blocs)
re.compile(regex_pgp_signature)
re.compile(regex_pgp_message)
res = re.findall(regex_pgp_public_blocs, content)
for it in res:
content = content.replace(it, '')
res = re.findall(regex_pgp_signature, content)
for it in res:
content = content.replace(it, '')
res = re.findall(regex_pgp_message, content)
for it in res:
content = content.replace(it, '')
return content
def detect_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
languages = []
## CLEAN CONTENT ##
content = self.get_html2text_content(ignore_links=True)
content = self._languages_cleaner(content=content)
# REMOVE USELESS SPACE
content = ' '.join(content.split())
# - CLEAN CONTENT - #
if len(content) >= min_len:
for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
languages.append(lang)
return languages
# def translate(self, content=None): # TODO translation plugin
# # TODO get text language
# if not content:
# content = self.get_content()
# translated = argostranslate.translate.translate(content, 'ru', 'en')
# # Save translation
# self._set_translation(translated)
# return translated
def create(self, content, translation=None, tags=[]):
self._set_field('content', content)
# r_content.get(f'content:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', content)
if translation:
self._set_translation(translation)
for tag in tags:
self.add_tag(tag)
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
def delete(self):
pass
def create_obj_id(chat_instance, chat_id, message_id, timestamp, channel_id=None, thread_id=None):
timestamp = int(timestamp)
if channel_id and thread_id:
return f'{chat_instance}/{timestamp}/{chat_id}/{chat_id}/{message_id}' # TODO add thread ID ?????
elif channel_id:
return f'{chat_instance}/{timestamp}/{channel_id}/{chat_id}/{message_id}'
elif thread_id:
return f'{chat_instance}/{timestamp}/{chat_id}/{thread_id}/{message_id}'
else:
return f'{chat_instance}/{timestamp}/{chat_id}/{message_id}'
# TODO Check if already exists
# def create(source, chat_id, message_id, timestamp, content, tags=[]):
def create(obj_id, content, translation=None, tags=[]):
message = Message(obj_id)
if not message.exists():
message.create(content, translation=translation, tags=tags)
return message
# TODO Encode translation
if __name__ == '__main__':
r = 'test'
print(r)