Skip to content

Commit 9590fac

Browse files
committed
Add email body parsing
1 parent dc04f85 commit 9590fac

File tree

1 file changed

+42
-26
lines changed

1 file changed

+42
-26
lines changed

backend/onyx/connectors/imap/connector.py

Lines changed: 42 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,8 @@
44
from email.utils import parseaddr
55
from typing import Any
66

7+
import bs4
8+
79
from onyx.access.models import ExternalAccess
810
from onyx.configs.constants import DocumentSource
911
from onyx.connectors.credentials_provider import OnyxStaticCredentialsProvider
@@ -16,10 +18,12 @@
1618
from onyx.connectors.models import BasicExpertInfo
1719
from onyx.connectors.models import ConnectorCheckpoint
1820
from onyx.connectors.models import Document
19-
from onyx.connectors.models import ImageSection
2021
from onyx.connectors.models import TextSection
22+
from onyx.utils.logger import setup_logger
2123
from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector
2224

25+
logger = setup_logger()
26+
2327

2428
DEFAULT_IMAP_PORT_NUMBER = 993
2529
IMAP_OKAY_STATUS = "OK"
@@ -122,19 +126,16 @@ def _convert_email_headers_and_body_into_document(
122126
) -> Document:
123127
_sender_name, sender_addr = parseaddr(addr=email_headers.sender)
124128
recipient_name, recipient_addr = parseaddr(addr=email_headers.recipient)
125-
126-
semantic_identifier = (
127-
f"{sender_addr} to {recipient_addr} about {email_headers.subject}"
128-
)
129-
130-
sections = _parse_email_body(email_msg=email_msg, email_headers=email_headers)
129+
title = f"{sender_addr} to {recipient_addr} about {email_headers.subject}"
130+
email_body = _parse_email_body(email_msg=email_msg, email_headers=email_headers)
131131

132132
return Document(
133-
id=semantic_identifier,
134-
semantic_identifier=semantic_identifier,
133+
id=email_headers.id,
134+
title=title,
135+
semantic_identifier=email_headers.subject,
135136
metadata={},
136137
source=DocumentSource.IMAP,
137-
sections=sections,
138+
sections=[TextSection(text=email_body)],
138139
primary_owners=[
139140
BasicExpertInfo(
140141
display_name=recipient_name,
@@ -152,22 +153,37 @@ def _convert_email_headers_and_body_into_document(
152153
def _parse_email_body(
153154
email_msg: Message,
154155
email_headers: EmailHeaders,
155-
) -> list[TextSection | ImageSection]:
156-
# _sender_name, sender_addr = parseaddr(email_headers.sender)
157-
# _recipient_name, recipient_addr = parseaddr(email_headers.recipient)
158-
# plain_text_body = ""
159-
# for part in msg.walk():
160-
# email_headers = EmailHeaders.from_email_msg()
161-
# ctype = part.get_content_type()
162-
# cdisp = str(part.get('Content-Disposition'))
163-
# if ctype == 'text/plain' and 'attachment' not in cdisp:
164-
# try:
165-
# plain_text_body = part.get_payload(decode=True).decode(part.get_content_charset() or 'utf-8')
166-
# except (UnicodeDecodeError, AttributeError):
167-
# plain_text_body = part.get_payload(decode=True).decode('latin-1', errors='ignore')
168-
# break
169-
170-
raise NotImplementedError
156+
) -> str:
157+
body = None
158+
for part in email_msg.walk():
159+
if part.is_multipart():
160+
continue
161+
162+
charset = part.get_content_charset() or "utf-8"
163+
164+
try:
165+
raw_payload = part.get_payload(decode=True)
166+
if not isinstance(raw_payload, bytes):
167+
logger.warn(
168+
"Payload section from email was expected to be an array of bytes, instead got "
169+
f"{type(raw_payload)=}, {raw_payload=}"
170+
)
171+
continue
172+
body = raw_payload.decode(charset)
173+
break
174+
except (UnicodeDecodeError, LookupError) as e:
175+
print(f"Warning: Could not decode part with charset {charset}. Error: {e}")
176+
continue
177+
178+
if not body:
179+
logger.warn(
180+
f"Email with {email_headers.id=} has an empty body; returning an empty string"
181+
)
182+
return ""
183+
184+
soup = bs4.BeautifulSoup(markup=body, features="html.parser")
185+
186+
return "".join(str_section for str_section in soup.stripped_strings)
171187

172188

173189
if __name__ == "__main__":

0 commit comments

Comments
 (0)