fix: Somewhat broken emails needed some love

pull/360/head
Raphaël Vinot 2019-12-05 19:11:01 +01:00
parent 6fcd9c9b8d
commit b70c32af7b
1 changed file with 24 additions and 18 deletions

@@ -111,19 +111,20 @@ def handler(q=False):
     mail_body = email_object.email.get_body(preferencelist=('html', 'plain'))
     if extract_urls:
-        charset = mail_body.get_content_charset()
-        if mail_body.get_content_type() == 'text/html':
-            url_parser = HTMLURLParser()
-            url_parser.feed(mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
-            urls = url_parser.urls
-        else:
-            urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
-        for url in urls:
-            if not url:
-                continue
-            url_object = URLObject(url, standalone=False)
-            file_objects.append(url_object)
-            email_object.add_reference(url_object.uuid, 'includes', 'URL in email body')
+        if mail_body:
+            charset = mail_body.get_content_charset()
+            if mail_body.get_content_type() == 'text/html':
+                url_parser = HTMLURLParser()
+                url_parser.feed(mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
+                urls = url_parser.urls
+            else:
+                urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
+            for url in urls:
+                if not url:
+                    continue
+                url_object = URLObject(url, standalone=False)
+                file_objects.append(url_object)
+                email_object.add_reference(url_object.uuid, 'includes', 'URL in email body')
     objects = [email_object.to_json()]
     if file_objects:
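
Why the new `if mail_body:` guard matters: a minimal sketch, not part of the commit, using only the stdlib email API this module already relies on. EmailMessage.get_body() returns None when a message has no 'html' or 'plain' candidate part (for example an attachment-only email), so the old code crashed with AttributeError on mail_body.get_content_charset(). The raw message below is a made-up minimal example.

    import email
    from email import policy

    # An email whose only part is an attachment: no 'html' or 'plain' body.
    raw = (
        "Subject: no text parts\r\n"
        "MIME-Version: 1.0\r\n"
        "Content-Type: application/octet-stream\r\n"
        "Content-Disposition: attachment; filename=blob.bin\r\n"
        "\r\n"
        "AAAA\r\n"
    )
    msg = email.message_from_string(raw, policy=policy.default)

    mail_body = msg.get_body(preferencelist=('html', 'plain'))
    print(mail_body)  # None: the old code raised here; the guard now skips URL extraction
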
@@ -213,18 +214,23 @@ def get_zip_passwords(message):
     body = []
     for part in message.walk():
         charset = part.get_content_charset()
+        if not charset:
+            charset = "utf-8"
         if part.get_content_type() == 'text/plain':
             body.append(part.get_payload(decode=True).decode(charset, errors='ignore'))
         elif part.get_content_type() == 'text/html':
             html_parser = HTMLTextParser()
-            html_parser.feed(part.get_payload(decode=True).decode(charset, errors='ignore'))
-            for text in html_parser.text_data:
-                body.append(text)
+            payload = part.get_payload(decode=True)
+            if payload:
+                html_parser.feed(payload.decode(charset, errors='ignore'))
+                for text in html_parser.text_data:
+                    body.append(text)
     raw_text = "\n".join(body).strip()
     # Add subject to text corpus to parse
-    subject = " " + message.get('Subject')
-    raw_text += subject
+    if "Subject" in message:
+        subject = " " + message.get('Subject')
+        raw_text += subject
     # Grab any strings that are marked off by special chars
     marking_chars = [["\'", "\'"], ['"', '"'], ['[', ']'], ['(', ')']]
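
This hunk hardens get_zip_passwords() in the same spirit. A minimal sketch, not part of the commit, of two failure modes it fixes: get_content_charset() returns None when a text part declares no charset, so bytes.decode(None) raises TypeError; and message.get('Subject') returns None when the header is absent, so " " + None also raises TypeError. The raw message here is again a made-up minimal example.

    import email
    from email import policy

    # A text part with no charset parameter and no Subject header.
    raw = (
        "Content-Type: text/plain\r\n"
        "\r\n"
        "hello\r\n"
    )
    part = email.message_from_string(raw, policy=policy.default)

    charset = part.get_content_charset()
    print(charset)  # None
    if not charset:
        charset = "utf-8"  # same fallback as the patch
    print(part.get_payload(decode=True).decode(charset, errors="ignore"))  # hello

    print("Subject" in part)  # False: the patch now checks this before touching the header
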