diff --git a/email2pdf b/email2pdf
index 5a8b136..ef49c85 100755
--- a/email2pdf
+++ b/email2pdf
@@ -8,6 +8,7 @@ from sys import platform as _platform
 from urllib.error import URLError, HTTPError
 from urllib.request import Request, urlopen
 import argparse
+import chardet
 import email
 import functools
 import html
@@ -369,6 +370,21 @@ def handle_html_message_body(input_email, part):
         charset = 'utf-8'
     logger.info("Determined email is HTML with charset " + str(charset))
 
+    try:
+        payload_unicode = str(payload, charset)
+    except (UnicodeDecodeError, LookupError):
+        # The declared charset is wrong (or has no codec in Python); let chardet
+        # guess instead. chardet reports an encoding of None when it has no
+        # guess, so fall back to UTF-8 rather than crashing on the None.
+        charset = chardet.detect(payload)["encoding"] or 'utf-8'
+        logger.info("Detected charset can't decode body; trying again with charset " + charset)
+        try:
+            payload_unicode = str(payload, charset, errors='replace')
+        except LookupError:
+            # chardet named an encoding this Python has no codec for.
+            charset = 'utf-8'
+            payload_unicode = str(payload, charset, errors='replace')
+
     def cid_replace(cid_parts_used, matchobj):
         cid = matchobj.group(1)
 
@@ -391,7 +407,7 @@ def handle_html_message_body(input_email, part):
             return "broken"
 
     payload = re.sub(r'cid:([\w_@.-]+)', functools.partial(cid_replace, cid_parts_used),
-                     str(payload, charset))
+                     payload_unicode)
 
     return (payload, cid_parts_used)
 
diff --git a/tests/Subprocess/test_Subprocess_MIME.py b/tests/Subprocess/test_Subprocess_MIME.py
index 210f48e..910b878 100644
--- a/tests/Subprocess/test_Subprocess_MIME.py
+++ b/tests/Subprocess/test_Subprocess_MIME.py
@@ -141,6 +141,20 @@ def test_plainandhtml(self):
         self.assertFalse(self.existsByTimeWarning())
         self.assertFalse(self.existsByTimeOriginal())
 
+    def test_wrong_charset_html(self):
+        # The body is declared as UTF-8 but carries the raw byte \xe9, which is
+        # not valid UTF-8 here, so decoding must fall back to charset detection.
+        self.addHeaders()
+        broken_body = b"\n\nSomething with raw accents: \xe9\n\n"
+        self.attachHTML(broken_body, charset="utf-8")
+        (rc, output, error) = self.invokeAsSubprocess()
+        self.assertEqual(0, rc)
+        self.assertEqual('', error)
+        self.assertTrue(self.existsByTime())
+        self.assertRegex(self.getPDFText(self.getTimedFilename()), r"Something\swith\sraw\saccents:\sé")
+        self.assertFalse(self.existsByTimeWarning())
+        self.assertFalse(self.existsByTimeOriginal())
+
     def test_pdf(self):
         self.addHeaders()
         self.attachText("Some basic textual content")