Added support for malformed internationalized email headers

When an email contains headers that use Unicode without properly crafting
them to conform to RFC-6323, the email import module would crash.
(See issue #119 & issue #93)

To address this I have added additional layers of encoding/decoding to
any possibly internationalized email headers. This decodes properly
formed and malformed UTF-8, UTF-16, and UTF-32 headers appropriately.
When an unknown encoding is encountered it is returned as an 'encoded-word'
per RFC2047.

This commit also adds unit tests that test properly formed and malformed
UTF-8, UTF-16, UTF-32, and CJK encoded strings in all header fields; UTF-8,
UTF-16, and UTF-32 encoded message bodies; and emoji testing for headers
and attachment file names.
pull/129/head
seamus tuohy 2017-07-02 17:07:09 -04:00
parent 3eecf9afe5
commit 40c71af637
2 changed files with 272 additions and 7 deletions

View File

@ -5,11 +5,14 @@ import json
import base64
import io
import zipfile
import codecs
import re
from email import message_from_bytes
from email.utils import parseaddr
from email.iterators import typed_subpart_iterator
from email.parser import Parser
from html.parser import HTMLParser
from email.header import decode_header
misperrors = {'error': 'Error'}
userConfig = {}
@ -38,7 +41,14 @@ def handler(q=False):
request = json.loads(q)
# request data is always base 64 byte encoded
data = base64.b64decode(request["data"])
message = message_from_bytes(data)
# Double decode to force headers to be re-parsed with proper encoding
message = Parser().parsestr(message_from_bytes(data).as_string())
# Decode any encoded headers to get at proper string
for key, val in message.items():
replacement = get_decoded_header(key, val)
if replacement is not None:
message.replace_header(key, replacement)
# Extract all header information
all_headers = ""
@ -340,6 +350,36 @@ def get_charset(message, default="ascii"):
return default
# Byte order marks removed before decoding, paired with the codec they belong
# to. UTF-32 is tried before UTF-16 so that UTF-32 data is not silently
# mis-decoded as UTF-16 when the charset has to be guessed.
_HEADER_BOMS = (
    ('utf-8', codecs.BOM_UTF8),
    ('utf-32', codecs.BOM_UTF32),
    ('utf-16', codecs.BOM_UTF16),
)


def _decode_header_chunk(chunk, encoding):
    """Decode one ``(chunk, charset)`` pair produced by ``decode_header``.

    Raises ``UnicodeDecodeError`` or ``LookupError`` when the chunk cannot be
    turned into text with the declared (or any guessed) charset.
    """
    if isinstance(chunk, str):
        return chunk
    if encoding is None:
        # Plain text sitting next to an encoded-word. latin-1 maps every byte
        # to a code point, so this can never raise.
        return chunk.decode('latin-1')
    chunk = chunk.strip()  # extra whitespace will mess up encoding
    if encoding == 'unknown-8bit':
        # Raw 8-bit bytes with no declared charset: try the UTF family.
        for codec_name, bom in _HEADER_BOMS:
            try:
                return chunk.replace(bom, b"").decode(codec_name)
            except UnicodeDecodeError:
                continue
        raise LookupError("no UTF codec decodes this unknown-8bit header")
    # Remove Byte Order Marks (BOM) from UTF strings. Long headers are split
    # into several encoded-words that each carry a BOM, so every occurrence
    # is removed, not only a leading one.
    for codec_name, bom in _HEADER_BOMS:
        if encoding == codec_name:
            chunk = chunk.replace(bom, b"")
    return chunk.decode(encoding)


def get_decoded_header(header, value):
    """Return the decoded text of an RFC 2047 encoded header value.

    Args:
        header: Name of the header. Unused; kept for interface compatibility.
        value: The header value as returned by ``message.items()``.

    Returns:
        ``None`` when ``value`` contains no encoded-words (nothing needs to
        be replaced); otherwise the decoded header text. All parts of the
        header are decoded and joined, so plain text next to an encoded-word
        (for example an address after an encoded display name) is kept.
        When any part cannot be decoded, the original value is returned
        unchanged: better to have the analyst decode it themselves than to
        provide a mangled string.
    """
    chunks = decode_header(value)
    # decode_header() hands back the untouched str when it finds no
    # encoded-word in the value.
    if all(isinstance(chunk, str) for chunk, _ in chunks):
        return None
    try:
        decoded = [_decode_header_chunk(chunk, encoding)
                   for chunk, encoding in chunks]
    except (LookupError, UnicodeDecodeError):
        return str(value)
    return "".join(decoded).strip()
def introspection():
modulesetup = {}
try:

View File

@ -7,6 +7,7 @@ import base64
import json
import os
import io
import re
import zipfile
from hashlib import sha256
from email.mime.application import MIMEApplication
@ -316,7 +317,6 @@ class TestModules(unittest.TestCase):
query['data'] = decode_email(message)
data = json.dumps(query)
response = requests.post(self.url + "query", data=data)
# print(response.json())
values = [x["values"] for x in response.json()["results"]]
self.assertIn('EICAR.com', values)
for i in response.json()['results']:
@ -341,10 +341,12 @@ class TestModules(unittest.TestCase):
message.attach(MIMEText(text, 'html', encoding[0]))
query['data'] = decode_email(message)
data = json.dumps(query)
response = requests.post(self.url + "query", data=data)
response = requests.post(self.url + "query", data=data).json()
self.assertNotIn('error', response, response.get('error', ""))
self.assertIn('results', response, "No server results found.")
def test_email_header_encoding(self):
def test_email_header_proper_encoding(self):
query = {"module":"email_import"}
query["config"] = {"unzip_attachments": None,
"guess_zip_attachment_passwords": None,
@ -358,13 +360,236 @@ class TestModules(unittest.TestCase):
"""
message.attach(MIMEText(text, 'plain'))
for hdr, hdr_val in message.items():
# Encoding is used as the name of the file
msg = message
hdr_encoded = MIMEText(hdr_val.encode(encoding), 'plain', encoding)
msg[hdr] = Header(hdr_val, encoding)
encoded_header = hdr_val.encode(encoding)
msg.replace_header(hdr, Header(encoded_header, encoding))
query['data'] = decode_email(msg)
data = json.dumps(query)
response = requests.post(self.url + "query", data=data)
results = response.json()['results']
values = []
for x in results:
# Remove BOM from UTF-16 strings
if re.search('\ufeff', x["values"]):
values.append(re.sub('\ufeff', "", x["values"]))
else:
values.append(x["values"])
types = {}
for i in results:
types.setdefault(i["type"], 0)
types[i["type"]] += 1
# Check that all the items were correct
self.assertEqual(types['target-email'], 1)
self.assertIn('test@domain.com', values)
self.assertEqual(types['email-dst-display-name'], 4)
self.assertIn('Last One', values)
self.assertIn('Other Friend', values)
self.assertIn('Second Person', values)
self.assertIn('Testy Testerson', values)
self.assertEqual(types['email-dst'], 4)
self.assertIn('test@domain.com', values)
self.assertIn('second@domain.com', values)
self.assertIn('other@friend.net', values)
self.assertIn('last_one@finally.com', values)
self.assertEqual(types['email-src-display-name'], 2)
self.assertIn("Innocent Person", values)
self.assertEqual(types['email-src'], 2)
self.assertIn("evil_spoofer@example.com", values)
self.assertIn("IgnoreMeImInnocent@sender.com", values)
self.assertEqual(types['email-thread-index'], 1)
self.assertIn('AQHSR8Us3H3SoaY1oUy9AAwZfMF922bnA9GAgAAi9s4AAGvxAA==', values)
self.assertEqual(types['email-message-id'], 1)
self.assertIn("<4988EF2D.40804@example.com>", values)
self.assertEqual(types['email-subject'], 1)
self.assertIn("Example Message", values)
self.assertEqual(types['email-header'], 1)
self.assertEqual(types['email-x-mailer'], 1)
self.assertIn("mlx 5.1.7", values)
self.assertEqual(types['email-reply-to'], 1)
self.assertIn("<CI7DgL-A6dm92s7gf4-88g@E_0x238G4K2H08H9SDwsw8b6LwuA@mail.example.com>", values)
self.assertIn("<CI7DgL-A6dm92s7gf4-88g@E_0x238G4K2H08H9SDwsw8b6LwuA@mail.example.com>", values)
def test_email_header_malformed_encoding(self):
    """Headers holding raw UTF-8/16/32 bytes must still be parsed.

    For every header of the base e-mail, the header value in the serialized
    message is swapped for its raw encoded bytes (no RFC 2047 encoded-word
    wrapping) and the result is submitted to the email_import module.
    """
    query = {"module": "email_import",
             "config": {"unzip_attachments": None,
                        "guess_zip_attachment_passwords": None,
                        "extract_urls": None}}
    text = ('I am a test e-mail\n'
            'the password is NOT "this string".\n'
            'That is all.\n')
    for encoding in ['utf-8', 'utf-16', 'utf-32']:
        message = get_base_email()
        message.attach(MIMEText(text, 'plain'))
        for hdr, hdr_val in message.items():
            # subTest names the failing encoding/header combination.
            with self.subTest(encoding=encoding, header=hdr):
                encoded_header = hdr_val.encode(encoding)
                # Literal byte replacement: header values contain regex
                # metacharacters ('.', '(', '+', ...) and must not be
                # interpreted as a pattern.
                message_bytes = message.as_bytes().replace(hdr_val.encode(),
                                                           encoded_header)
                query['data'] = base64.b64encode(message_bytes).decode()
                data = json.dumps(query)
                response = requests.post(self.url + "query", data=data)
                results = response.json()['results']
                # Remove BOM from UTF-16 strings
                values = [x["values"].replace('\ufeff', "") for x in results]
                types = {}
                for i in results:
                    types[i["type"]] = types.get(i["type"], 0) + 1
                # Check that all the items were correct
                self.assertEqual(types['target-email'], 1)
                self.assertIn('test@domain.com', values)
                self.assertEqual(types['email-dst-display-name'], 4)
                self.assertIn('Last One', values)
                self.assertIn('Other Friend', values)
                self.assertIn('Second Person', values)
                self.assertIn('Testy Testerson', values)
                self.assertEqual(types['email-dst'], 4)
                self.assertIn('test@domain.com', values)
                self.assertIn('second@domain.com', values)
                self.assertIn('other@friend.net', values)
                self.assertIn('last_one@finally.com', values)
                self.assertEqual(types['email-src-display-name'], 2)
                self.assertIn("Innocent Person", values)
                self.assertEqual(types['email-src'], 2)
                self.assertIn("evil_spoofer@example.com", values)
                self.assertIn("IgnoreMeImInnocent@sender.com", values)
                self.assertEqual(types['email-thread-index'], 1)
                self.assertIn('AQHSR8Us3H3SoaY1oUy9AAwZfMF922bnA9GAgAAi9s4AAGvxAA==', values)
                self.assertEqual(types['email-message-id'], 1)
                self.assertIn("<4988EF2D.40804@example.com>", values)
                self.assertEqual(types['email-subject'], 1)
                self.assertIn("Example Message", values)
                self.assertEqual(types['email-header'], 1)
                self.assertEqual(types['email-x-mailer'], 1)
                self.assertIn("mlx 5.1.7", values)
                self.assertEqual(types['email-reply-to'], 1)
                self.assertIn("<CI7DgL-A6dm92s7gf4-88g@E_0x238G4K2H08H9SDwsw8b6LwuA@mail.example.com>", values)
def test_email_header_CJK_encoding(self):
    """A Subject properly encoded as euc_jisx0213 is returned decoded."""
    query = {"module": "email_import",
             "config": {"unzip_attachments": None,
                        "guess_zip_attachment_passwords": None,
                        "extract_urls": None}}
    message = get_base_email()
    text = ('I am a test e-mail\n'
            'the password is NOT "this string".\n'
            'That is all.\n')
    message.attach(MIMEText(text, 'plain'))
    japanese_charset = "ビット及び8ビットの2バイト情報交換用符号化拡張漢字集合"
    jisx213 = Header(japanese_charset, 'euc_jisx0213')
    message.replace_header("Subject", jisx213)
    query['data'] = decode_email(message)
    data = json.dumps(query)
    response = requests.post(self.url + "query", data=data)
    # Parse Response
    RFC_format = '=?euc_jisx0213?b?pdOlw6XItdqk0zil06XDpcikzjKl0KWkpci+8MrzuPK0uc3RyeS55rK9s8jEpbTBu/q9uLnn?='
    RFC_encoding_error = "The subject was not decoded from RFC2047 format."
    subjects = [i['values'] for i in response.json()['results']
                if i['type'] == 'email-subject']
    # Without this guard the checks below are skipped, and the test passes,
    # whenever the module returns no subject at all.
    self.assertTrue(subjects, "No email-subject attribute was returned.")
    for subject in subjects:
        self.assertNotEqual(RFC_format, subject, RFC_encoding_error)
        self.assertEqual(japanese_charset, subject, "Subject not properly decoded")
def test_email_malformed_header_CJK_encoding(self):
    """A Subject holding raw, unwrapped UTF-8 CJK bytes is returned decoded."""
    query = {"module": "email_import",
             "config": {"unzip_attachments": None,
                        "guess_zip_attachment_passwords": None,
                        "extract_urls": None}}
    message = get_base_email()
    text = ('I am a test e-mail\n'
            'the password is NOT "this string".\n'
            'That is all.\n')
    message.attach(MIMEText(text, 'plain'))
    japanese_charset = "ビット及び8ビットの2バイト情報交換用符号化拡張漢字集合"
    japanese_bytes = japanese_charset.encode()
    # Serialize with an ASCII placeholder, then splice the raw bytes in so
    # the header bypasses RFC 2047 encoding.
    message.replace_header('Subject', "{{REPLACE}}")
    message_bytes = message.as_bytes().replace(b'{{REPLACE}}', japanese_bytes)
    message64 = base64.b64encode(message_bytes).decode()
    query['data'] = message64
    data = json.dumps(query)
    response = requests.post(self.url + "query", data=data)
    # Parse Response
    RFC_format = '=?euc_jisx0213?b?pdOlw6XItdqk0zil06XDpcikzjKl0KWkpci+8MrzuPK0uc3RyeS55rK9s8jEpbTBu/q9uLnn?='
    RFC_encoding_error = "The subject was not decoded from RFC2047 format."
    subjects = [i['values'] for i in response.json()['results']
                if i['type'] == 'email-subject']
    # Without this guard the checks below are skipped, and the test passes,
    # whenever the module returns no subject at all.
    self.assertTrue(subjects, "No email-subject attribute was returned.")
    for subject in subjects:
        self.assertNotEqual(RFC_format, subject, RFC_encoding_error)
        self.assertEqual(japanese_charset, subject, "Subject not properly decoded")
def test_email_malformed_header_emoji_encoding(self):
    """A Subject holding raw, unwrapped UTF-8 emoji bytes is returned decoded."""
    query = {"module": "email_import",
             "config": {"unzip_attachments": None,
                        "guess_zip_attachment_passwords": None,
                        "extract_urls": None}}
    message = get_base_email()
    text = ('I am a test e-mail\n'
            'the password is NOT "this string".\n'
            'That is all.\n')
    message.attach(MIMEText(text, 'plain'))
    emoji_string = "Emoji Test 👍 checking this"
    emoji_bytes = emoji_string.encode()
    # Serialize with an ASCII placeholder, then splice the raw bytes in so
    # the header bypasses RFC 2047 encoding.
    message.replace_header('Subject', "{{EMOJI}}")
    message_bytes = message.as_bytes().replace(b'{{EMOJI}}', emoji_bytes)
    message64 = base64.b64encode(message_bytes).decode()
    query['data'] = message64
    data = json.dumps(query)
    response = requests.post(self.url + "query", data=data)
    # Parse Response
    RFC_format = "=?unknown-8bit?q?Emoji_Test_=F0=9F=91=8D_checking_this?="
    RFC_encoding_error = "The subject was not decoded from RFC2047 format."
    subjects = [i['values'] for i in response.json()['results']
                if i['type'] == 'email-subject']
    # Without this guard the checks below are skipped, and the test passes,
    # whenever the module returns no subject at all.
    self.assertTrue(subjects, "No email-subject attribute was returned.")
    for subject in subjects:
        self.assertNotEqual(RFC_format, subject, RFC_encoding_error)
        self.assertEqual(emoji_string, subject, "Subject not properly decoded")
def test_email_attachment_emoji_filename(self):
    """An attachment whose file name contains an emoji keeps that name."""
    emoji_name = "Emoji Test 👍 checking this"
    query = {"module": "email_import"}
    query["config"] = dict.fromkeys(["unzip_attachments",
                                     "guess_zip_attachment_passwords",
                                     "extract_urls"])
    message = get_base_email()
    message.attach(MIMEText("""I am a test e-mail""", 'plain'))
    with open("tests/EICAR.com", "rb") as eicar_file:
        attachment = MIMEApplication(eicar_file.read(), 'com')
    attachment.add_header('Content-Disposition', 'attachment',
                          filename=emoji_name)
    message.attach(attachment)
    query['data'] = decode_email(message)
    response = requests.post(self.url + "query", data=json.dumps(query))
    results = response.json()['results']
    self.assertIn(emoji_name, [entry["values"] for entry in results])
    for entry in results:
        kind = entry["type"]
        if kind == 'email-attachment':
            self.assertEqual(entry["values"], emoji_name)
        if kind == 'malware-sample':
            payload = base64.b64decode(entry["data"])
            self.assertEqual(payload, b'X5O!P%@AP[4\\PZX54(P^)7CC)7}$EICAR-STANDARD-ANTIVIRUS-TEST-')
def test_email_attachment_password_in_subject(self):
query = {"module": "email_import"}