mirror of https://github.com/MISP/PyMISP
Fix #787 and add Unicode to ASCII function
Fix #787 - Uses a regex to pick up the hostnames/domains from the "Received: from" headers. Unicode to ASCII function - Spam messages more often than not contain junk text as Unicode characters in the headers, with the "From" and "Subject" headers being the most common ones. Before this change the script would error on such emails or sometimes replace the Unicode characters with question marks ("?"). - The function takes an argument as input and encodes it as ASCII while ignoring any characters that cannot be encoded. It then returns an ASCII string without the non-ASCII characters. - Currently implemented for "from" and "subject" handling. pull/789/head
parent
d44847b63a
commit
2fb354a938
|
@ -251,6 +251,16 @@ class EMailObject(AbstractMISPObjectGenerator):
|
||||||
pass
|
pass
|
||||||
return to_return
|
return to_return
|
||||||
|
|
||||||
|
def unicode_to_ascii(self, arg):
    """
    Return *arg* as text with every non-ASCII character removed.

    Spam messages commonly contain emojis and other non-ASCII characters in
    their headers. Dropping them (rather than replacing them) yields a plain
    ASCII string; for example "héllo" becomes "hllo".

    :param arg: Header value to clean. Normally a ``str``; any other object
        (for instance an ``email.header.Header``) is converted with ``str()``
        first, and ``None`` is treated as an empty value.
    :return: A ``str`` containing only the ASCII characters of the input, in
        their original order. Empty when *arg* is ``None``.
    """
    if arg is None:
        return ""
    # Non-str values may expose an ``encode`` method with a different
    # signature (``email.header.Header.encode`` does), so normalise to str
    # before calling ``str.encode``.
    text = arg if isinstance(arg, str) else str(arg)
    # errors="ignore" silently drops characters that have no ASCII encoding.
    return text.encode("ascii", "ignore").decode("ascii")
|
||||||
|
|
||||||
def generate_attributes(self):
|
def generate_attributes(self):
|
||||||
|
|
||||||
# Attach original & Converted
|
# Attach original & Converted
|
||||||
|
@ -286,7 +296,8 @@ class EMailObject(AbstractMISPObjectGenerator):
|
||||||
self.__add_emails("to", message["Delivered-To"])
|
self.__add_emails("to", message["Delivered-To"])
|
||||||
|
|
||||||
if "From" in message:
|
if "From" in message:
|
||||||
self.__add_emails("from", message["From"])
|
from_ascii = self.unicode_to_ascii(message["From"])
|
||||||
|
self.__add_emails("from", from_ascii)
|
||||||
|
|
||||||
if "Return-Path" in message:
|
if "Return-Path" in message:
|
||||||
realname, address = email.utils.parseaddr(message["Return-Path"])
|
realname, address = email.utils.parseaddr(message["Return-Path"])
|
||||||
|
@ -299,7 +310,8 @@ class EMailObject(AbstractMISPObjectGenerator):
|
||||||
self.__add_emails("cc", message["Cc"])
|
self.__add_emails("cc", message["Cc"])
|
||||||
|
|
||||||
if "Subject" in message:
|
if "Subject" in message:
|
||||||
self.add_attribute("subject", message["Subject"])
|
subject_ascii = self.unicode_to_ascii(message["Subject"])
|
||||||
|
self.add_attribute("subject", subject_ascii)
|
||||||
|
|
||||||
if "Message-ID" in message:
|
if "Message-ID" in message:
|
||||||
self.add_attribute("message-id", message["Message-ID"])
|
self.add_attribute("message-id", message["Message-ID"])
|
||||||
|
@ -317,15 +329,6 @@ class EMailObject(AbstractMISPObjectGenerator):
|
||||||
if "Thread-Index" in message:
|
if "Thread-Index" in message:
|
||||||
self.add_attribute("thread-index", message["Thread-Index"])
|
self.add_attribute("thread-index", message["Thread-Index"])
|
||||||
|
|
||||||
if "Received" in message:
|
|
||||||
try:
|
|
||||||
# We only want the hostnames
|
|
||||||
received_content = message['Received'].split(' ')
|
|
||||||
if received_content[0] == 'from':
|
|
||||||
self.add_attribute("received-header-hostname", received_content[1])
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
self.__generate_received()
|
self.__generate_received()
|
||||||
|
|
||||||
def __add_emails(self, typ: str, data: str, insert_display_names: bool = True):
|
def __add_emails(self, typ: str, data: str, insert_display_names: bool = True):
|
||||||
|
@ -354,7 +357,7 @@ class EMailObject(AbstractMISPObjectGenerator):
|
||||||
|
|
||||||
def __generate_received(self):
|
def __generate_received(self):
|
||||||
"""
|
"""
|
||||||
Extract IP addresses from received headers that are not private.
|
Extract IP addresses from received headers that are not private. Also extract hostnames or domains.
|
||||||
"""
|
"""
|
||||||
received_items = self.email.get_all("received")
|
received_items = self.email.get_all("received")
|
||||||
if received_items is None:
|
if received_items is None:
|
||||||
|
@ -378,3 +381,11 @@ class EMailObject(AbstractMISPObjectGenerator):
|
||||||
continue # skip header if IP not found or is private
|
continue # skip header if IP not found or is private
|
||||||
|
|
||||||
self.add_attribute("received-header-ip", value=str(ip), comment=fromstr)
|
self.add_attribute("received-header-ip", value=str(ip), comment=fromstr)
|
||||||
|
|
||||||
|
# The hostnames and/or domains always come after the "Received: from"
|
||||||
|
# part so we can use regex to pick up those attributes.
|
||||||
|
received_from = re.findall(r'(?<=from\s)[\w\d\.\-]+\.\w{2,24}', str(received_items))
|
||||||
|
try:
|
||||||
|
[self.add_attribute("received-header-hostname", i) for i in received_from]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
Loading…
Reference in New Issue