fix: [cld3 python3.10] temp disable cld3

pull/586/head
Terrtia 2022-06-21 14:41:36 +02:00
parent 3b333826e5
commit 4d39b2c813
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
3 changed files with 70 additions and 44 deletions

View File

@ -3,7 +3,6 @@
import os import os
import sys import sys
import cld3
import time import time
from packages import Item from packages import Item

View File

@ -6,30 +6,38 @@ import os
import re import re
import sys import sys
import redis import redis
import cld3 # import cld3
import html2text import html2text
from io import BytesIO from io import BytesIO
from pymisp import MISPObject
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from export.Export import get_ail_uuid # # TODO: REPLACE
from lib.objects.abstract_object import AbstractObject
from lib.ConfigLoader import ConfigLoader
from lib import item_basic
from lib import domain_basic
from packages import Tag
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
import Tag
import Cryptocurrency import Cryptocurrency
import Pgp import Pgp
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import item_basic
import domain_basic
import ConfigLoader
import Correlate_object import Correlate_object
import Decoded import Decoded
import Screenshot import Screenshot
import Username import Username
from abstract_object import AbstractObject
from item_basic import *
from flask import url_for from flask import url_for
config_loader = ConfigLoader.ConfigLoader() config_loader = ConfigLoader()
# get and sanityze PASTE DIRECTORY # get and sanityze PASTE DIRECTORY
# # TODO: rename PASTES_FOLDER # # TODO: rename PASTES_FOLDER
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/' PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
@ -89,6 +97,12 @@ class Item(AbstractObject):
""" """
return item_basic.get_item_content(self.id) return item_basic.get_item_content(self.id)
def get_raw_content(self):
filepath = self.get_filename()
with open(filepath, 'rb') as f:
raw_content = BytesIO(f.read())
return raw_content
def get_gzip_content(self, b64=False): def get_gzip_content(self, b64=False):
with open(self.get_filename(), 'rb') as f: with open(self.get_filename(), 'rb') as f:
content = f.read() content = f.read()
@ -97,8 +111,7 @@ class Item(AbstractObject):
return content.decode() return content.decode()
def get_ail_2_ail_payload(self): def get_ail_2_ail_payload(self):
payload = {'raw': self.get_gzip_content(b64=True), payload = {'raw': self.get_gzip_content(b64=True)}
'compress': 'gzip'}
return payload return payload
# # TODO: # # TODO:
@ -108,6 +121,7 @@ class Item(AbstractObject):
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
# TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ... # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
def delete(self): def delete(self):
self._delete()
try: try:
os.remove(self.get_filename()) os.remove(self.get_filename())
return True return True
@ -128,9 +142,19 @@ class Item(AbstractObject):
color = '#332288' color = '#332288'
return {'style': '', 'icon': '', 'color': color, 'radius':5} return {'style': '', 'icon': '', 'color': color, 'radius':5}
############################################################################ def get_misp_object(self):
############################################################################ obj_date = self.get_date()
############################################################################ obj = MISPObject('ail-leak', standalone=True)
obj.first_seen = obj_date
obj_attrs = []
obj_attrs.append( obj.add_attribute('first-seen', value=obj_date) )
obj_attrs.append( obj.add_attribute('raw-data', value=self.id, data=self.get_raw_content()) )
obj_attrs.append( obj.add_attribute('sensor', value=get_ail_uuid()) )
for obj_attr in obj_attrs:
for tag in self.get_tags():
obj_attr.add_tag(tag)
return obj
def exist_correlation(self): def exist_correlation(self):
pass pass
@ -249,20 +273,20 @@ def remove_all_urls_from_content(item_id, item_content=None):
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7): def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
all_languages = [] all_languages = []
## CLEAN CONTENT ## # ## CLEAN CONTENT ##
content = get_item_content_html2text(item_id, ignore_links=True) # content = get_item_content_html2text(item_id, ignore_links=True)
content = remove_all_urls_from_content(item_id, item_content=content) # content = remove_all_urls_from_content(item_id, item_content=content)
#
# REMOVE USELESS SPACE # # REMOVE USELESS SPACE
content = ' '.join(content.split()) # content = ' '.join(content.split())
#- CLEAN CONTENT -# # #- CLEAN CONTENT -#
#
#print(content) # #print(content)
#print(len(content)) # #print(len(content))
if len(content) >= min_len: # if len(content) >= min_len:
for lang in cld3.get_frequent_languages(content, num_langs=num_langs): # for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable: # if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
all_languages.append(lang) # all_languages.append(lang)
return all_languages return all_languages
# API # API
@ -688,4 +712,7 @@ def delete_domain_node(item_id):
delete_item(child_id) delete_item(child_id)
#if __name__ == '__main__': # if __name__ == '__main__':
#
# item = Item('')
# print(item.get_misp_object().to_json())

View File

@ -6,7 +6,7 @@ import os
import re import re
import sys import sys
import redis import redis
import cld3 # import cld3
import html2text import html2text
from io import BytesIO from io import BytesIO
@ -148,20 +148,20 @@ def remove_all_urls_from_content(item_id, item_content=None):
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7): def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
all_languages = [] all_languages = []
## CLEAN CONTENT ## # ## CLEAN CONTENT ##
content = get_item_content_html2text(item_id, ignore_links=True) # content = get_item_content_html2text(item_id, ignore_links=True)
content = remove_all_urls_from_content(item_id, item_content=content) # content = remove_all_urls_from_content(item_id, item_content=content)
#
# REMOVE USELESS SPACE # # REMOVE USELESS SPACE
content = ' '.join(content.split()) # content = ' '.join(content.split())
#- CLEAN CONTENT -# # #- CLEAN CONTENT -#
#
#print(content) # #print(content)
#print(len(content)) # #print(len(content))
if len(content) >= min_len: # if len(content) >= min_len:
for lang in cld3.get_frequent_languages(content, num_langs=num_langs): # for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable: # if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
all_languages.append(lang) # all_languages.append(lang)
return all_languages return all_languages
# API # API