fix: Fall back to UTF-8 when the mail body declares no charset

pull/474/head
Jakub Onderka 2021-03-02 15:03:15 +01:00 committed by GitHub
parent 3e168aceb8
commit 38457f0a7b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 14 additions and 15 deletions

View File

@ -110,21 +110,20 @@ def handler(q=False):
email_object.add_reference(f_object.uuid, 'includes', 'Email attachment')
mail_body = email_object.email.get_body(preferencelist=('html', 'plain'))
if extract_urls:
if mail_body:
charset = mail_body.get_content_charset()
if mail_body.get_content_type() == 'text/html':
url_parser = HTMLURLParser()
url_parser.feed(mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
urls = url_parser.urls
else:
urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
for url in urls:
if not url:
continue
url_object = URLObject(url, standalone=False)
file_objects.append(url_object)
email_object.add_reference(url_object.uuid, 'includes', 'URL in email body')
if extract_urls and mail_body:
charset = mail_body.get_content_charset('utf-8')
if mail_body.get_content_type() == 'text/html':
url_parser = HTMLURLParser()
url_parser.feed(mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
urls = url_parser.urls
else:
urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', mail_body.get_payload(decode=True).decode(charset, errors='ignore'))
for url in urls:
if not url:
continue
url_object = URLObject(url, standalone=False)
file_objects.append(url_object)
email_object.add_reference(url_object.uuid, 'includes', 'URL in email body')
objects = [email_object.to_json()]
if file_objects: