fix: Somewhat broken emails needed some love

pull/360/head
Raphaël Vinot 2019-12-05 19:11:01 +01:00
parent 6fcd9c9b8d
commit b70c32af7b
1 changed file with 24 additions and 18 deletions

View File

@@ -111,6 +111,7 @@ def handler(q=False):
mail_body = email_object.email.get_body(preferencelist=('html', 'plain')) mail_body = email_object.email.get_body(preferencelist=('html', 'plain'))
if extract_urls: if extract_urls:
if mail_body:
charset = mail_body.get_content_charset() charset = mail_body.get_content_charset()
if mail_body.get_content_type() == 'text/html': if mail_body.get_content_type() == 'text/html':
url_parser = HTMLURLParser() url_parser = HTMLURLParser()
@@ -213,16 +214,21 @@ def get_zip_passwords(message):
body = [] body = []
for part in message.walk(): for part in message.walk():
charset = part.get_content_charset() charset = part.get_content_charset()
if not charset:
charset = "utf-8"
if part.get_content_type() == 'text/plain': if part.get_content_type() == 'text/plain':
body.append(part.get_payload(decode=True).decode(charset, errors='ignore')) body.append(part.get_payload(decode=True).decode(charset, errors='ignore'))
elif part.get_content_type() == 'text/html': elif part.get_content_type() == 'text/html':
html_parser = HTMLTextParser() html_parser = HTMLTextParser()
html_parser.feed(part.get_payload(decode=True).decode(charset, errors='ignore')) payload = part.get_payload(decode=True)
if payload:
html_parser.feed(payload.decode(charset, errors='ignore'))
for text in html_parser.text_data: for text in html_parser.text_data:
body.append(text) body.append(text)
raw_text = "\n".join(body).strip() raw_text = "\n".join(body).strip()
# Add subject to text corpus to parse # Add subject to text corpus to parse
if "Subject" in message:
subject = " " + message.get('Subject') subject = " " + message.get('Subject')
raw_text += subject raw_text += subject