4
4
from email .utils import parseaddr
5
5
from typing import Any
6
6
7
+ import bs4
8
+
7
9
from onyx .access .models import ExternalAccess
8
10
from onyx .configs .constants import DocumentSource
9
11
from onyx .connectors .credentials_provider import OnyxStaticCredentialsProvider
16
18
from onyx .connectors .models import BasicExpertInfo
17
19
from onyx .connectors .models import ConnectorCheckpoint
18
20
from onyx .connectors .models import Document
19
- from onyx .connectors .models import ImageSection
20
21
from onyx .connectors .models import TextSection
22
+ from onyx .utils .logger import setup_logger
21
23
from tests .daily .connectors .utils import load_all_docs_from_checkpoint_connector
22
24
25
+ logger = setup_logger ()
26
+
23
27
24
28
# Default IMAP port number; 993 is the registered port for IMAP over TLS.
DEFAULT_IMAP_PORT_NUMBER = 993
25
29
# Status string denoting success — presumably compared against IMAP command
# responses; the usage is outside this view, confirm against callers.
IMAP_OKAY_STATUS = "OK"
@@ -122,19 +126,16 @@ def _convert_email_headers_and_body_into_document(
122
126
) -> Document :
123
127
_sender_name , sender_addr = parseaddr (addr = email_headers .sender )
124
128
recipient_name , recipient_addr = parseaddr (addr = email_headers .recipient )
125
-
126
- semantic_identifier = (
127
- f"{ sender_addr } to { recipient_addr } about { email_headers .subject } "
128
- )
129
-
130
- sections = _parse_email_body (email_msg = email_msg , email_headers = email_headers )
129
+ title = f"{ sender_addr } to { recipient_addr } about { email_headers .subject } "
130
+ email_body = _parse_email_body (email_msg = email_msg , email_headers = email_headers )
131
131
132
132
return Document (
133
- id = semantic_identifier ,
134
- semantic_identifier = semantic_identifier ,
133
+ id = email_headers .id ,
134
+ title = title ,
135
+ semantic_identifier = email_headers .subject ,
135
136
metadata = {},
136
137
source = DocumentSource .IMAP ,
137
- sections = sections ,
138
+ sections = [ TextSection ( text = email_body )] ,
138
139
primary_owners = [
139
140
BasicExpertInfo (
140
141
display_name = recipient_name ,
@@ -152,22 +153,37 @@ def _convert_email_headers_and_body_into_document(
152
153
def _parse_email_body(
    email_msg: Message,
    email_headers: EmailHeaders,
) -> str:
    """Extract the readable text of an email as a single string.

    Walks the MIME tree of ``email_msg`` and uses the first leaf part that is
    textual, is not an attachment, and decodes cleanly with its declared
    charset (UTF-8 when none is declared). The decoded text is parsed with
    BeautifulSoup's ``html.parser`` so markup is dropped, and the remaining
    text fragments are joined with single spaces.

    Args:
        email_msg: The parsed email message to read the body from.
        email_headers: Headers of the same email; only ``id`` is read, for
            the log message emitted when no body is found.

    Returns:
        The extracted body text, or ``""`` when no part could be decoded or
        the decoded body is empty.
    """
    body = None
    for part in email_msg.walk():
        # Multipart containers carry no payload of their own; walk() visits
        # their children separately.
        if part.is_multipart():
            continue

        # Attachments and non-text parts (images, PDFs, ...) are not the body
        # and must not be decoded as if they were.
        if (
            part.get_content_maintype() != "text"
            or part.get_content_disposition() == "attachment"
        ):
            continue

        raw_payload = part.get_payload(decode=True)
        if not isinstance(raw_payload, bytes):
            logger.warning(
                "Payload section from email was expected to be an array of bytes, instead got "
                f"{type(raw_payload)=}, {raw_payload=}"
            )
            continue

        charset = part.get_content_charset() or "utf-8"
        try:
            body = raw_payload.decode(charset)
        except (UnicodeDecodeError, LookupError) as e:
            # LookupError covers charset names Python has no codec for.
            logger.warning(f"Could not decode part with charset {charset}. Error: {e}")
            continue

        # First cleanly decoded text part wins.
        break

    if not body:
        logger.warning(
            f"Email with {email_headers.id=} has an empty body; returning an empty string"
        )
        return ""

    soup = bs4.BeautifulSoup(markup=body, features="html.parser")

    # Join with a space so text from adjacent elements (e.g. consecutive <p>
    # tags) does not run together into a single word.
    return " ".join(soup.stripped_strings)
171
187
172
188
173
189
if __name__ == "__main__" :
0 commit comments