Skip to content
This repository has been archived by the owner on Dec 11, 2021. It is now read-only.

Commit

Permalink
Handle badly declared charsets in message body
Browse files Browse the repository at this point in the history
  • Loading branch information
daggelpop committed Jun 23, 2021
1 parent 1b6bfc3 commit 536551e
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
11 changes: 10 additions & 1 deletion email2pdf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ from sys import platform as _platform
from urllib.error import URLError, HTTPError
from urllib.request import Request, urlopen
import argparse
import chardet
import email
import functools
import html
Expand Down Expand Up @@ -369,6 +370,14 @@ def handle_html_message_body(input_email, part):
charset = 'utf-8'
logger.info("Determined email is HTML with charset " + str(charset))

# Decode the raw body bytes to text. The declared charset is tried first; if
# it cannot decode the bytes (UnicodeDecodeError) or is not a codec Python
# knows (LookupError), fall back to chardet's guess, and finally to a lossy
# UTF-8 decode so a badly declared charset never aborts the conversion.
try:
    payload_unicode = str(payload, charset)
except (UnicodeDecodeError, LookupError):
    detected_charset = chardet.detect(payload)["encoding"]
    payload_unicode = None

    # chardet reports an encoding of None when it cannot identify one.
    if detected_charset is not None:
        logger.info("Detected charset can't decode body; trying again with charset " + detected_charset)
        try:
            payload_unicode = str(payload, detected_charset)
            charset = detected_charset
        except (UnicodeDecodeError, LookupError):
            # The guess was wrong or names an unsupported codec; use the
            # last-resort decode below.
            payload_unicode = None

    if payload_unicode is None:
        charset = 'utf-8'
        logger.info("Could not determine a working charset for body; decoding as utf-8 with replacement characters")
        payload_unicode = str(payload, charset, 'replace')

def cid_replace(cid_parts_used, matchobj):
cid = matchobj.group(1)

Expand All @@ -391,7 +400,7 @@ def handle_html_message_body(input_email, part):
return "broken"

payload = re.sub(r'cid:([\[email protected]]+)', functools.partial(cid_replace, cid_parts_used),
str(payload, charset))
payload_unicode)

return (payload, cid_parts_used)

Expand Down
12 changes: 12 additions & 0 deletions tests/Subprocess/test_Subprocess_MIME.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,18 @@ def test_plainandhtml(self):
self.assertFalse(self.existsByTimeWarning())
self.assertFalse(self.existsByTimeOriginal())

def test_wrong_charset_html(self):
    """Check that an HTML body whose bytes do not match its declared charset still converts.

    The body contains the single byte 0xE9, which is "é" in Latin-1 but is
    not valid UTF-8 on its own, while the part is declared as utf-8. The
    conversion is expected to succeed with no error output, no warning or
    original file, and the accented character present in the PDF text.
    """
    self.addHeaders()
    broken_body = b"<p>Something with raw accents: \xe9</p>"
    self.attachHTML(broken_body, charset="utf-8")
    (rc, output, error) = self.invokeAsSubprocess()
    self.assertEqual(0, rc)
    self.assertEqual('', error)
    self.assertTrue(self.existsByTime())
    # Raw string so that \s reaches the regex engine without relying on
    # invalid string-literal escapes; "é" needs no escaping in a pattern.
    self.assertRegex(self.getPDFText(self.getTimedFilename()), r"Something\swith\sraw\saccents:\sé")
    self.assertFalse(self.existsByTimeWarning())
    self.assertFalse(self.existsByTimeOriginal())

def test_pdf(self):
self.addHeaders()
self.attachText("Some basic textual content")
Expand Down

0 comments on commit 536551e

Please sign in to comment.