Fix #787 and add Unicode to ASCII function

Fix #787
- Uses regex to pick up the hostnames/domains from the "Received: from" headers.

Unicode to ASCII function
- Spam messages often contain junk text as Unicode characters in the headers, most commonly in the "From" and "Subject" headers. Before this change, the script would raise an error on such emails or sometimes replace the Unicode characters with question marks ("?").
- The function takes a string as input and encodes it as ASCII, ignoring any characters that cannot be represented in ASCII. It then returns an ASCII string without the Unicode characters.
- Currently implemented for "from" and "subject" handling.
pull/789/head
Sami Tainio 2021-09-28 14:50:17 +03:00 committed by GitHub
parent d44847b63a
commit 2fb354a938
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 23 additions and 12 deletions

View File

@ -251,6 +251,16 @@ class EMailObject(AbstractMISPObjectGenerator):
pass pass
return to_return return to_return
def unicode_to_ascii(self, arg):
    """
    Return ``arg`` with every non-ASCII character removed.

    Spam messages commonly contain unicode encoded emojis in their headers.
    According to the original author, MISP cannot handle those: they would
    either cause an error or show up as "?" in the UI.

    The text is encoded to ASCII with the ``"ignore"`` error handler, so
    characters without an ASCII representation are dropped silently rather
    than raising ``UnicodeEncodeError`` or being replaced by a placeholder.

    :param arg: header value to sanitise. Values that are not ``str``
        (for example ``email.header.Header`` objects) are converted with
        ``str()`` before encoding.
    :return: a ``str`` made only of the ASCII characters of ``arg``, in
        their original order.
    """
    # Header-like objects may expose their own ``encode()`` with a different
    # signature than ``str.encode``; normalise to ``str`` first so the
    # ``("ascii", "ignore")`` arguments always mean codec + error handler.
    text = arg if isinstance(arg, str) else str(arg)
    # The encoded bytes are pure ASCII, so decoding as ASCII cannot fail.
    return text.encode("ascii", "ignore").decode("ascii")
def generate_attributes(self): def generate_attributes(self):
# Attach original & Converted # Attach original & Converted
@ -286,7 +296,8 @@ class EMailObject(AbstractMISPObjectGenerator):
self.__add_emails("to", message["Delivered-To"]) self.__add_emails("to", message["Delivered-To"])
if "From" in message: if "From" in message:
self.__add_emails("from", message["From"]) from_ascii = self.unicode_to_ascii(message["From"])
self.__add_emails("from", from_ascii)
if "Return-Path" in message: if "Return-Path" in message:
realname, address = email.utils.parseaddr(message["Return-Path"]) realname, address = email.utils.parseaddr(message["Return-Path"])
@ -299,7 +310,8 @@ class EMailObject(AbstractMISPObjectGenerator):
self.__add_emails("cc", message["Cc"]) self.__add_emails("cc", message["Cc"])
if "Subject" in message: if "Subject" in message:
self.add_attribute("subject", message["Subject"]) subject_ascii = self.unicode_to_ascii(message["Subject"])
self.add_attribute("subject", subject_ascii)
if "Message-ID" in message: if "Message-ID" in message:
self.add_attribute("message-id", message["Message-ID"]) self.add_attribute("message-id", message["Message-ID"])
@ -317,15 +329,6 @@ class EMailObject(AbstractMISPObjectGenerator):
if "Thread-Index" in message: if "Thread-Index" in message:
self.add_attribute("thread-index", message["Thread-Index"]) self.add_attribute("thread-index", message["Thread-Index"])
if "Received" in message:
try:
# We only want the hostnames
received_content = message['Received'].split(' ')
if received_content[0] == 'from':
self.add_attribute("received-header-hostname", received_content[1])
except Exception:
pass
self.__generate_received() self.__generate_received()
def __add_emails(self, typ: str, data: str, insert_display_names: bool = True): def __add_emails(self, typ: str, data: str, insert_display_names: bool = True):
@ -354,7 +357,7 @@ class EMailObject(AbstractMISPObjectGenerator):
def __generate_received(self): def __generate_received(self):
""" """
Extract IP addresses from received headers that are not private. Extract IP addresses from received headers that are not private. Also extract hostnames or domains.
""" """
received_items = self.email.get_all("received") received_items = self.email.get_all("received")
if received_items is None: if received_items is None:
@ -378,3 +381,11 @@ class EMailObject(AbstractMISPObjectGenerator):
continue # skip header if IP not found or is private continue # skip header if IP not found or is private
self.add_attribute("received-header-ip", value=str(ip), comment=fromstr) self.add_attribute("received-header-ip", value=str(ip), comment=fromstr)
# The hostnames and/or domains always come after the "Received: from"
# part so we can use regex to pick up those attributes.
received_from = re.findall(r'(?<=from\s)[\w\d\.\-]+\.\w{2,24}', str(received_items))
try:
[self.add_attribute("received-header-hostname", i) for i in received_from]
except Exception:
pass