# Matches a single LF or CR. Used to unfold header values by deleting only
# those two characters.
linesep_splitter = re.compile(r"\n|\r")


def my_header_fetch_parse(name, value):
    """Return a header object for ``value`` with RFC 2047 encoded words decoded.

    Intended as a ``header_fetch_parse`` override for an ``EmailPolicy``
    clone. Encoded words are decoded with ``decode_header``/``make_header``
    before the value is unfolded, then the result is handed to the default
    policy's ``header_factory``.

    Args:
        name: Header field name (e.g. ``"Subject"``).
        value: Raw header value as a string, or an already-parsed header
            object (anything exposing a ``name`` attribute).

    Returns:
        ``value`` itself when it is already a header object; otherwise the
        header object built by ``email.policy.default.header_factory``.
    """
    if hasattr(value, "name"):
        # Already a parsed header object; hand it back untouched.
        return value
    try:
        decoded = str(make_header(decode_header(value)))
    except (MessageError, LookupError, ValueError):
        # Malformed encoded words (bad base64 -> HeaderParseError, unknown
        # charset -> LookupError, undecodable bytes -> UnicodeDecodeError,
        # which is a ValueError) would otherwise blow up at header access
        # time. Fall back to the raw value and let header_factory's own,
        # more tolerant parser deal with it.
        decoded = value
    unfolded = "".join(linesep_splitter.split(decoded))
    return email.policy.default.header_factory(name, unfolded)
def test_headers(self):
    """Ingest the multiline-header fixture and check From/Subject decoding.

    The fixture's From and Subject headers are RFC 2047 encoded words
    folded over several lines; the ingested entity must expose them as
    single decoded strings.
    """
    expected_sender = "Отдел по работе с прохождением законопроектов "
    expected_subject = (
        "Дополнительное заключение Профильного Комитета на проект закона №145-Д"
    )

    path, email_entity = self.fixture("email_multiline_headers.eml")
    self.manager.ingest(path, email_entity)
    self.assertSuccess(email_entity)

    self.assertEqual(email_entity.get("from"), [expected_sender])
    self.assertEqual(email_entity.get("subject"), [expected_subject])