new: Refactored emailobject generator

pull/631/head
Jakub Onderka 2020-09-29 14:27:05 +02:00
parent d39d4caf7d
commit f598865ce4
3 changed files with 250 additions and 92 deletions

75
poetry.lock generated
View File

@ -284,6 +284,14 @@ zipp = ">=0.5"
docs = ["sphinx", "rst.linker"]
testing = ["packaging", "pep517", "importlib-resources (>=1.3)"]
[[package]]
name = "ipaddress"
version = "1.0.23"
description = "IPv4/IPv6 manipulation library"
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "ipykernel"
version = "5.3.4"
@ -484,6 +492,19 @@ category = "main"
optional = true
python-versions = ">=2.7"
[[package]]
name = "mail-parser"
version = "3.12.0"
description = "Wrapper for email standard library"
category = "main"
optional = false
python-versions = "*"
[package.dependencies]
ipaddress = "1.0.23"
simplejson = "3.17.0"
six = "1.14.0"
[[package]]
name = "markupsafe"
version = "1.1.1"
@ -913,9 +934,17 @@ category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "simplejson"
version = "3.17.0"
description = "Simple, fast, extensible JSON encoder/decoder for Python"
category = "main"
optional = false
python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "six"
version = "1.15.0"
version = "1.14.0"
description = "Python 2 and 3 compatibility utilities"
category = "main"
optional = false
@ -1193,7 +1222,7 @@ virustotal = ["validators"]
[metadata]
lock-version = "1.1"
python-versions = "^3.6"
content-hash = "a2bf3a2d2162cc76563904258ac8b667801f14c3f3ff9df310b4d5c23d4e13d9"
content-hash = "23aa8f0499f0012761ac2f91c02c2ad02a3a4fb53dd57bdcdca5db2b32b54634"
[metadata.files]
alabaster = [
@ -1384,6 +1413,10 @@ importlib-metadata = [
{file = "importlib_metadata-2.0.0-py2.py3-none-any.whl", hash = "sha256:cefa1a2f919b866c5beb7c9f7b0ebb4061f30a8a9bf16d609b000e2dfaceb9c3"},
{file = "importlib_metadata-2.0.0.tar.gz", hash = "sha256:77a540690e24b0305878c37ffd421785a6f7e53c8b5720d211b211de8d0e95da"},
]
ipaddress = [
{file = "ipaddress-1.0.23-py2.py3-none-any.whl", hash = "sha256:6e0f4a39e66cb5bb9a137b00276a2eff74f93b71dcbdad6f10ff7df9d3557fcc"},
{file = "ipaddress-1.0.23.tar.gz", hash = "sha256:b7f8e0369580bb4a24d5ba1d7cc29660a4a6987763faf1d8a8046830e020e7e2"},
]
ipykernel = [
{file = "ipykernel-5.3.4-py3-none-any.whl", hash = "sha256:d6fbba26dba3cebd411382bc484f7bc2caa98427ae0ddb4ab37fe8bfeb5c7dd3"},
{file = "ipykernel-5.3.4.tar.gz", hash = "sha256:9b2652af1607986a1b231c62302d070bc0534f564c393a5d9d130db9abbbe89d"},
@ -1448,6 +1481,10 @@ lief = [
{file = "lief-0.10.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:276cc63ec12a21bdf01b8d30962692c17499788234f0765247ca7a35872097ec"},
{file = "lief-0.10.1.tar.gz", hash = "sha256:a487fe7234c04bccd58223dbb79214421176e2629814c7a4a887764cceb5be7c"},
]
mail-parser = [
{file = "mail-parser-3.12.0.tar.gz", hash = "sha256:e8ff4ac4b27d4a0a87fe69cdaca9a9123f9662b28991b3b838e449a779345214"},
{file = "mail_parser-3.12.0-py3-none-any.whl", hash = "sha256:b948e2905ae1f8823b2b2b3acaca8595d959cf73ca89e2bc86220b895f7af4d2"},
]
markupsafe = [
{file = "MarkupSafe-1.1.1-cp27-cp27m-macosx_10_6_intel.whl", hash = "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161"},
{file = "MarkupSafe-1.1.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"},
@ -1749,9 +1786,39 @@ send2trash = [
{file = "Send2Trash-1.5.0-py3-none-any.whl", hash = "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"},
{file = "Send2Trash-1.5.0.tar.gz", hash = "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2"},
]
simplejson = [
{file = "simplejson-3.17.0-cp27-cp27m-macosx_10_13_x86_64.whl", hash = "sha256:87d349517b572964350cc1adc5a31b493bbcee284505e81637d0174b2758ba17"},
{file = "simplejson-3.17.0-cp27-cp27m-win32.whl", hash = "sha256:1d1e929cdd15151f3c0b2efe953b3281b2fd5ad5f234f77aca725f28486466f6"},
{file = "simplejson-3.17.0-cp27-cp27m-win_amd64.whl", hash = "sha256:1ea59f570b9d4916ae5540a9181f9c978e16863383738b69a70363bc5e63c4cb"},
{file = "simplejson-3.17.0-cp33-cp33m-win32.whl", hash = "sha256:8027bd5f1e633eb61b8239994e6fc3aba0346e76294beac22a892eb8faa92ba1"},
{file = "simplejson-3.17.0-cp33-cp33m-win_amd64.whl", hash = "sha256:22a7acb81968a7c64eba7526af2cf566e7e2ded1cb5c83f0906b17ff1540f866"},
{file = "simplejson-3.17.0-cp34-cp34m-win32.whl", hash = "sha256:17163e643dbf125bb552de17c826b0161c68c970335d270e174363d19e7ea882"},
{file = "simplejson-3.17.0-cp34-cp34m-win_amd64.whl", hash = "sha256:0fe3994207485efb63d8f10a833ff31236ed27e3b23dadd0bf51c9900313f8f2"},
{file = "simplejson-3.17.0-cp35-cp35m-win32.whl", hash = "sha256:4cf91aab51b02b3327c9d51897960c554f00891f9b31abd8a2f50fd4a0071ce8"},
{file = "simplejson-3.17.0-cp35-cp35m-win_amd64.whl", hash = "sha256:fc9051d249dd5512e541f20330a74592f7a65b2d62e18122ca89bf71f94db748"},
{file = "simplejson-3.17.0-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:86afc5b5cbd42d706efd33f280fec7bd7e2772ef54e3f34cf6b30777cd19a614"},
{file = "simplejson-3.17.0-cp36-cp36m-win32.whl", hash = "sha256:926bcbef9eb60e798eabda9cd0bbcb0fca70d2779aa0aa56845749d973eb7ad5"},
{file = "simplejson-3.17.0-cp36-cp36m-win_amd64.whl", hash = "sha256:daaf4d11db982791be74b23ff4729af2c7da79316de0bebf880fa2d60bcc8c5a"},
{file = "simplejson-3.17.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:9a126c3a91df5b1403e965ba63b304a50b53d8efc908a8c71545ed72535374a3"},
{file = "simplejson-3.17.0-cp37-cp37m-win32.whl", hash = "sha256:fc046afda0ed8f5295212068266c92991ab1f4a50c6a7144b69364bdee4a0159"},
{file = "simplejson-3.17.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7cce4bac7e0d66f3a080b80212c2238e063211fe327f98d764c6acbc214497fc"},
{file = "simplejson-3.17.0.tar.gz", hash = "sha256:2b4b2b738b3b99819a17feaf118265d0753d5536049ea570b3c43b51c4701e81"},
{file = "simplejson-3.17.0.win-amd64-py2.7.exe", hash = "sha256:1d346c2c1d7dd79c118f0cc7ec5a1c4127e0c8ffc83e7b13fc5709ff78c9bb84"},
{file = "simplejson-3.17.0.win-amd64-py3.3.exe", hash = "sha256:5cfd495527f8b85ce21db806567de52d98f5078a8e9427b18e251c68bd573a26"},
{file = "simplejson-3.17.0.win-amd64-py3.4.exe", hash = "sha256:8de378d589eccbc75941e480b4d5b4db66f22e4232f87543b136b1f093fff342"},
{file = "simplejson-3.17.0.win-amd64-py3.5.exe", hash = "sha256:f4b64a1031acf33e281fd9052336d6dad4d35eee3404c95431c8c6bc7a9c0588"},
{file = "simplejson-3.17.0.win-amd64-py3.6.exe", hash = "sha256:ad8dd3454d0c65c0f92945ac86f7b9efb67fa2040ba1b0189540e984df904378"},
{file = "simplejson-3.17.0.win-amd64-py3.7.exe", hash = "sha256:229edb079d5dd81bf12da952d4d825bd68d1241381b37d3acf961b384c9934de"},
{file = "simplejson-3.17.0.win32-py2.7.exe", hash = "sha256:4fd5f79590694ebff8dc980708e1c182d41ce1fda599a12189f0ca96bf41ad70"},
{file = "simplejson-3.17.0.win32-py3.3.exe", hash = "sha256:d140e9376e7f73c1f9e0a8e3836caf5eec57bbafd99259d56979da05a6356388"},
{file = "simplejson-3.17.0.win32-py3.4.exe", hash = "sha256:da00675e5e483ead345429d4f1374ab8b949fba4429d60e71ee9d030ced64037"},
{file = "simplejson-3.17.0.win32-py3.5.exe", hash = "sha256:7739940d68b200877a15a5ff5149e1599737d6dd55e302625650629350466418"},
{file = "simplejson-3.17.0.win32-py3.6.exe", hash = "sha256:60aad424e47c5803276e332b2a861ed7a0d46560e8af53790c4c4fb3420c26c2"},
{file = "simplejson-3.17.0.win32-py3.7.exe", hash = "sha256:1fbba86098bbfc1f85c5b69dc9a6d009055104354e0d9880bb00b692e30e0078"},
]
six = [
{file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"},
{file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"},
{file = "six-1.14.0-py2.py3-none-any.whl", hash = "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"},
{file = "six-1.14.0.tar.gz", hash = "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a"},
]
snowballstemmer = [
{file = "snowballstemmer-2.0.0-py2.py3-none-any.whl", hash = "sha256:209f257d7533fdb3cb73bdbd24f436239ca3b2fa67d56f6ff88e86be08cc5ef0"},

View File

@ -1,121 +1,211 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from email import policy
from email.message import EmailMessage
from io import BytesIO
from pathlib import Path
from typing import Union, List, Tuple
import email.utils
import ipaddress
import logging
import mailparser # type: ignore
from mailparser.utils import msgconvert # type: ignore
from ..exceptions import InvalidMISPObject
from .abstractgenerator import AbstractMISPObjectGenerator
from io import BytesIO
import logging
from email import message_from_bytes, policy
import email.utils
from pathlib import Path
from typing import Union
try:
import magic # type: ignore
import tempfile
except ImportError:
magic = None
logger = logging.getLogger('pymisp')
class EMailObject(AbstractMISPObjectGenerator):
# Constructor of the e-mail object generator: takes either a file path or an
# in-memory BytesIO buffer, optionally attaches the original message as an
# "eml" attribute, and then generates the remaining attributes.
# NOTE(review): two revisions of this method appear interleaved below without
# diff markers; which lines are removed and which are added is inferred from
# content only — confirm against the commit.
def __init__(self, filepath: Union[Path, str] = None, pseudofile: BytesIO = None,
attach_original_email: bool = True, **kwargs):
super().__init__("email", **kwargs)
def __init__(self, filepath: Union[Path, str] = None, pseudofile: BytesIO = None, attach_original_email: bool = True, **kwargs):
# PY3 way:
# super().__init__('file')
super(EMailObject, self).__init__('email', **kwargs)
# Remembers whether the input went through the Outlook MSG converter so the
# attached copy can be labelled accordingly.
converted = False
if filepath:
with open(filepath, 'rb') as f:
self.__pseudofile = BytesIO(f.read())
# Paths ending in ".msg" are converted; any other path is read as raw bytes.
if str(filepath).endswith(".msg"):
pseudofile = self.__convert_outlook_msg_format(str(filepath))
converted = True
else:
with open(filepath, "rb") as f:
pseudofile = BytesIO(f.read())
elif pseudofile and isinstance(pseudofile, BytesIO):
self.__pseudofile = pseudofile
if magic:
# if python-magic is installed, we can autodetect MS Outlook format
mime = magic.from_buffer(pseudofile.read(2048), mime=True)
# Rewind after sniffing the first 2048 bytes.
pseudofile.seek(0)
if mime == "application/CDFV2":
# save outlook msg file to temporary file
temph, temp = tempfile.mkstemp(prefix="outlook_")
with os.fdopen(temph, "wb") as fdfile:
fdfile.write(pseudofile.getvalue())
# NOTE(review): redundant — the enclosing `with` already closes the file.
fdfile.close()
# NOTE(review): if the conversion raises, the unlink below is never reached
# and the temporary file stays on disk.
pseudofile = self.__convert_outlook_msg_format(temp)
os.unlink(temp) # remove temporary file necessary to convert formats
converted = True
else:
# Neither a path nor a BytesIO buffer was supplied.
raise InvalidMISPObject('File buffer (BytesIO) or a path is required.')
self.__email = message_from_bytes(self.__pseudofile.getvalue(), policy=policy.default)
# Improperly encoded emails (utf-8-sig) fail silently. An empty email indicates this might be the case.
if len(self.__email) == 0:
self.attempt_decoding()
raise InvalidMISPObject("File buffer (BytesIO) or a path is required.")
if attach_original_email:
self.add_attribute('eml', value='Full email.eml', data=self.__pseudofile)
self.generate_attributes()
# The comment is only set when the buffer was produced by the MSG converter.
self.add_attribute("eml", value="Full email.eml", data=pseudofile,
comment="Converted from MSG format" if converted else None)
# Parse the bytes into an EmailMessage, wrap it in a MailParser and build the attributes.
message = self.attempt_decoding(pseudofile)
self.__parser = mailparser.MailParser(message)
self.__generate_attributes()
@staticmethod
def __convert_outlook_msg_format(filepath: str) -> BytesIO:
    """Convert an Outlook MSG file into an in-memory buffer.

    ``msgconvert`` returns a pair whose first element is the path of the
    converted file; the second element is not used here. The converted file
    is read completely into memory and then deleted.

    :param filepath: Path of the Outlook MSG file to convert.
    :return: The converted file's bytes wrapped in a ``BytesIO``.
    """
    converted_file, _ = msgconvert(filepath)
    try:
        with open(converted_file, "rb") as f:
            return BytesIO(f.read())
    finally:
        # Delete the converter's output even when reading it fails, so an
        # error does not leave the temporary file behind.
        os.remove(converted_file)
@staticmethod
def attempt_decoding(bytes_io: BytesIO) -> EmailMessage:
    """Attempt to decode different kinds of emails, for example non-ASCII encoded emails.

    The buffer is first parsed as-is. If that yields a message without any
    headers and the data starts with a UTF-8 byte-order mark, the BOM is
    removed and the data is parsed again.

    :param bytes_io: Buffer holding the raw email.
    :return: The parsed message.
    :raises Exception: If the data is plain ASCII but yields no headers, or
        if it is non-ASCII and not valid BOM-prefixed UTF-8.
    """
    raw = bytes_io.getvalue()

    message: EmailMessage = email.message_from_bytes(raw, policy=policy.default)  # type: ignore
    if len(message) != 0:
        return message

    # Improperly encoded emails (utf-8-sig) fail silently. An empty email indicates this might be the case.
    try:
        raw.decode("ASCII")
    except UnicodeDecodeError:
        logger.debug("EmailObject was passed a non-ASCII encoded binary blob.")
    else:
        # Pure ASCII and still no headers: there is nothing left to try.
        raise Exception("EmailObject failed to decode ASCII encoded email.")

    if raw[:3] == b'\xef\xbb\xbf':  # utf-8-sig byte-order mark (BOM)
        try:
            text = raw.decode("utf_8_sig")
        except UnicodeDecodeError:
            pass  # not valid UTF-8 after the BOM; fall through to the error below
        else:
            # Re-encode as UTF-8 rather than ASCII: this only drops the BOM,
            # so messages with non-ASCII content no longer fail with an
            # uncaught UnicodeEncodeError.
            return email.message_from_bytes(text.encode("utf-8"), policy=policy.default)  # type: ignore

    raise Exception(
        "EmailObject does not know how to decode binary blob passed to it. Object may not be an email. If this is an email please submit it as an issue to PyMISP so we can add support.")
# Read-only accessor for the parsed message. Two variants are shown: one
# returns the stored `__email`, the other the `message` of the MailParser
# held in `__parser`.
@property
def email(self):
return self.__email
def email(self) -> EmailMessage:
return self.__parser.message
# Returns a list of (filename, BytesIO) pairs, one per attachment reported by
# iter_attachments(). Two variants of the header and loop lines are shown.
@property
def attachments(self):
def attachments(self) -> List[Tuple[str, BytesIO]]:
to_return = []
for attachment in self.__email.iter_attachments():
content = attachment.get_content()
for attachment in self.email.iter_attachments():
content = attachment.get_content() # type: ignore
# Text content is encoded to bytes (default encoding) so it can be wrapped in BytesIO.
if isinstance(content, str):
content = content.encode()
to_return.append((attachment.get_filename(), BytesIO(content)))
return to_return
# Instance-method form of the decoding fallback: works on self.__pseudofile
# and, on success, replaces both self.__pseudofile and self.__email.
def attempt_decoding(self):
"""Attempt to decode non-ascii encoded emails.
"""
_msg_bytes = self.__pseudofile.getvalue()
try:
# If the payload is plain ASCII there is nothing to re-decode: log and return.
_msg_bytes.decode("ASCII")
logger.info("EmailObject failed to decode ASCII encoded email.")
return
except UnicodeDecodeError:
logger.debug("EmailObject was passed a non-ASCII encoded binary blob.")
try:
if _msg_bytes[:3] == b'\xef\xbb\xbf': # utf-8-sig byte-order mark (BOM)
# Set Pseudofile to correctly encoded email in case it is used at some later point.
# NOTE(review): .encode("ASCII") raises UnicodeEncodeError when the decoded text
# contains non-ASCII characters; the handler below only catches UnicodeDecodeError.
self.__pseudofile = BytesIO(_msg_bytes.decode('utf_8_sig').encode("ASCII"))
self.__email = message_from_bytes(self.__pseudofile.getvalue(), policy=policy.default)
except UnicodeDecodeError:
logger.debug("EmailObject does not know how to decode binary blob passed to it. Object may not be an email. If this is an email please submit it as an issue to PyMISP so we can add support.")
# Builds the object's attributes from the parsed message: body, headers and
# the individual address/metadata fields.
# NOTE(review): two revisions (`__generate_attributes` working on `message`
# and `generate_attributes` working on `self.__email`) are interleaved below
# without diff markers, together with the header of `_add_emails`.
def __generate_attributes(self):
message = self.email
def generate_attributes(self):
if self.__email.get_body(preferencelist=('html', 'plain')):
self.add_attribute('email-body', value=self.__email.get_body(preferencelist=('html', 'plain')).get_payload(decode=True).decode('utf8', 'surrogateescape'))
if 'Reply-To' in self.__email:
self.add_attribute('reply-to', value=self.__email['Reply-To'])
if 'Message-ID' in self.__email:
self.add_attribute('message-id', value=self.__email['Message-ID'])
if 'To' in self.__email:
self._add_emails('to', self.__email['To'])
if 'Cc' in self.__email:
# TODO: split name and email address
to_add = [to.strip() for to in self.__email['Cc'].split(',')]
self.add_attributes('cc', *to_add)
if 'Subject' in self.__email:
self.add_attribute('subject', value=self.__email['Subject'])
if 'From' in self.__email:
self._add_emails('from', self.__email['From'])
if 'Return-Path' in self.__email:
# TODO: split name and email address
self.add_attribute('return-path', value=self.__email['Return-Path'])
if 'User-Agent' in self.__email:
self.add_attribute('user-agent', value=self.__email['User-Agent'])
if self.__email.get_boundary():
self.add_attribute('mime-boundary', value=self.__email.get_boundary())
if 'X-Mailer' in self.__email:
self.add_attribute('x-mailer', value=self.__email['X-Mailer'])
if 'Thread-Index' in self.__email:
self.add_attribute('thread-index', value=self.__email['Thread-Index'])
if 'Date' in self.__email:
self.add_attribute('send-date', value=self._sanitize_timestamp(self.__email['Date']))
# TODO: email-header: all headers in one bloc
# TODO: BCC?
# TODO: received headers sometimes have TO email addresses
# The body part is looked up with the preference order ("html", "plain").
body = message.get_body(preferencelist=("html", "plain"))
if body:
self.add_attribute("email-body", body.get_payload(decode=True).decode('utf8', 'surrogateescape'))
def _add_emails(self, type: str, data: str):
parts = [part.strip() for part in data.split(',')]
# All headers are stored as one newline-joined block of "Name: value" lines.
headers = ["{}: {}".format(k, v) for k, v in message.items()]
if headers:
self.add_attribute("header", "\n".join(headers))
message_date = self.__parser.date
if message_date:
self.add_attribute("send-date", message_date)
if "To" in message:
self.__add_emails("to", message["To"])
if "From" in message:
self.__add_emails("from", message["From"])
if "Return-Path" in message:
# Only the address part is stored; the display name is discarded.
# NOTE(review): parseaddr returns an empty address when it cannot parse the
# value, in which case an empty string is added here — confirm that is intended.
realname, address = email.utils.parseaddr(message["Return-Path"])
self.add_attribute("return-path", address)
if "Reply-To" in message:
# NOTE(review): only the first parsed Reply-To entry is used, and the [0]
# index presumes the parser returned at least one entry — confirm.
realname, address = self.__parser.reply_to[0]
if address and realname:
self.add_attribute("reply-to", value=address, comment=message["Reply-To"])
elif address:
self.add_attribute("reply-to", address)
else: # invalid format, insert original value
self.add_attribute("reply-to", message["Reply-To"])
if "Cc" in message:
# Display-name attributes are not emitted for Cc.
self.__add_emails("cc", message["Cc"], insert_display_names=False)
if "Subject" in message:
self.add_attribute("subject", message["Subject"])
if "Message-ID" in message:
self.add_attribute("message-id", message["Message-ID"])
if "User-Agent" in message:
self.add_attribute("user-agent", message["User-Agent"])
boundary = message.get_boundary()
if boundary:
self.add_attribute("mime-boundary", boundary)
if "X-Mailer" in message:
self.add_attribute("x-mailer", message["X-Mailer"])
if "Thread-Index" in message:
self.add_attribute("thread-index", message["Thread-Index"])
self.__generate_received()
# Splits a comma-separated address header value and adds one attribute per
# entry under `typ`; when `insert_display_names` is true, the display names
# are also added under "<typ>-display-name".
# NOTE(review): two revisions of the loop body and of the final add_attributes
# calls are interleaved below without diff markers.
def __add_emails(self, typ: str, data: str, insert_display_names: bool = True):
# NOTE(review): a plain split on "," also splits a quoted display name that
# contains a comma, e.g. "Doe, John" <j@example.com>.
parts = [part.strip() for part in data.split(",")]
addresses = []
display_names = []
for part in parts:
realname, address = email.utils.parseaddr(part)
# NOTE(review): the four lines below call list.push(), which does not exist on
# Python lists, and their condition contradicts its own comment; the
# append()-based branch that follows is the working form.
if address: # parsing failed, insert original value
addresses.push({"value": part})
else:
addresses.push({"value": address, "comment": part})
# The original text is kept as a comment only when a display name was present.
if address and realname:
addresses.append({"value": address, "comment": part})
elif address:
addresses.append({"value": address})
else: # parsing failed, insert original value
addresses.append({"value": part})
if realname:
display_names.push({"value": realname, "comment": part})
display_names.append({"value": realname, "comment": part})
if addresses:
self.add_attributes(type, *addresses)
if display_names:
self.add_attributes("{}-display-name".format(type), *display_names)
self.add_attributes(typ, *addresses)
if insert_display_names and display_names:
self.add_attributes("{}-display-name".format(typ), *display_names)
def __generate_received(self):
    """
    Extract IP addresses from received headers that are not private.

    For each entry of the parser's ``received`` list, the first token of its
    ``from`` value that is a valid IP address is added as a
    ``received-header-ip`` attribute, with the full ``from`` value as the
    comment. Entries without a ``from`` value, without an IP address, or
    whose IP address is private are skipped.
    """
    for received in self.__parser.received:
        try:
            sender = received["from"]
        except KeyError:
            continue  # this hop has no "from" clause, nothing to extract

        ip = None
        for token in sender.split(" "):
            try:
                # Addresses in Received headers are commonly wrapped, e.g.
                # "(mail.example.com [203.0.113.5])", so strip the wrapping
                # punctuation before trying to parse the token.
                ip = ipaddress.ip_address(token.strip("[]()<>;,"))
                break
            except ValueError:
                pass  # token is not IP address

        if ip is None or ip.is_private:
            continue  # skip header if IP not found or is private

        self.add_attribute("received-header-ip", value=str(ip), comment=sender)

View File

@ -46,6 +46,7 @@ requests = "^2.22.0"
python-dateutil = "^2.8.1"
jsonschema = "^3.2.0"
deprecated = "^1.2.7"
mail-parser = {version = "3.12.0"}
python-magic = {version = "^0.4.15", optional = true}
pydeep = {version = "^0.4", optional = true}
lief = {version = "^0.10.1", optional = true}