fix: Handle malformed emails — guard against a missing message body, an empty decoded payload, and a missing Subject header

pull/360/head
Raphaël Vinot 2019-12-05 19:11:01 +01:00
parent 6fcd9c9b8d
commit b70c32af7b
1 changed file with 24 additions and 18 deletions

View File

@ -111,19 +111,20 @@ def handler(q=False):
mail_body = email_object.email.get_body(preferencelist=('html', 'plain'))
if extract_urls:
charset = mail_body.get_content_charset()
if mail_body.get_content_type() == 'text/html':
url_parser = HTMLURLParser()
url_parser.feed(mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
urls = url_parser.urls
else:
urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
for url in urls:
if not url:
continue
url_object = URLObject(url, standalone=False)
file_objects.append(url_object)
email_object.add_reference(url_object.uuid, 'includes', 'URL in email body')
if mail_body:
charset = mail_body.get_content_charset()
if mail_body.get_content_type() == 'text/html':
url_parser = HTMLURLParser()
url_parser.feed(mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
urls = url_parser.urls
else:
urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
for url in urls:
if not url:
continue
url_object = URLObject(url, standalone=False)
file_objects.append(url_object)
email_object.add_reference(url_object.uuid, 'includes', 'URL in email body')
objects = [email_object.to_json()]
if file_objects:
@ -213,18 +214,23 @@ def get_zip_passwords(message):
body = []
for part in message.walk():
charset = part.get_content_charset()
if not charset:
charset = "utf-8"
if part.get_content_type() == 'text/plain':
body.append(part.get_payload(decode=True).decode(charset, errors='ignore'))
elif part.get_content_type() == 'text/html':
html_parser = HTMLTextParser()
html_parser.feed(part.get_payload(decode=True).decode(charset, errors='ignore'))
for text in html_parser.text_data:
body.append(text)
payload = part.get_payload(decode=True)
if payload:
html_parser.feed(payload.decode(charset, errors='ignore'))
for text in html_parser.text_data:
body.append(text)
raw_text = "\n".join(body).strip()
# Add subject to text corpus to parse
subject = " " + message.get('Subject')
raw_text += subject
if "Subject" in message:
subject = " " + message.get('Subject')
raw_text += subject
# Grab any strings that are marked off by special chars
marking_chars = [["\'", "\'"], ['"', '"'], ['[', ']'], ['(', ')']]