Skip to content

Commit 809122f

Browse files
raunakabWeves
andauthored
fix: Fix bug in which emails would be fetched during initial indexing (#4959)
* Add new convenience method * Fix bug in which emails would be fetched for initial indexing * Improve tests for MS Teams connector * Fix test_gdrive_perm_sync_with_real_data patching * Protect against incorrect truthiness --------- Co-authored-by: Weves <chrisweaver101@gmail.com>
1 parent c8741d8 commit 809122f

File tree

5 files changed

+212
-133
lines changed

5 files changed

+212
-133
lines changed

backend/onyx/access/models.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ def truncate_set(s: set[str], max_len: int = 100) -> str:
4040
def num_entries(self) -> int:
4141
return len(self.external_user_emails) + len(self.external_user_group_ids)
4242

43+
@classmethod
44+
def public(cls) -> "ExternalAccess":
45+
return cls(
46+
external_user_emails=set(),
47+
external_user_group_ids=set(),
48+
is_public=True,
49+
)
50+
4351
@classmethod
4452
def empty(cls) -> "ExternalAccess":
4553
"""

backend/onyx/connectors/teams/connector.py

Lines changed: 9 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from office365.teams.channels.channel import Channel # type: ignore
1414
from office365.teams.team import Team # type: ignore
1515

16-
from onyx.access.models import ExternalAccess
1716
from onyx.configs.constants import DocumentSource
1817
from onyx.connectors.exceptions import ConnectorValidationError
1918
from onyx.connectors.exceptions import CredentialExpiredError
@@ -33,6 +32,7 @@
3332
from onyx.connectors.models import TextSection
3433
from onyx.connectors.teams.models import Message
3534
from onyx.connectors.teams.utils import fetch_expert_infos
35+
from onyx.connectors.teams.utils import fetch_external_access
3636
from onyx.connectors.teams.utils import fetch_messages
3737
from onyx.connectors.teams.utils import fetch_replies
3838
from onyx.file_processing.html_utils import parse_html_page_basic
@@ -43,7 +43,6 @@
4343
logger = setup_logger()
4444

4545
_SLIM_DOC_BATCH_SIZE = 5000
46-
_PUBLIC_MEMBERSHIP_TYPE = "standard" # public teams channel
4746

4847

4948
class TeamsCheckpoint(ConnectorCheckpoint):
@@ -260,18 +259,8 @@ def retrieve_all_slim_documents(
260259
)
261260
continue
262261

263-
is_public = _is_channel_public(channel=channel)
264-
expert_infos = (
265-
set()
266-
if is_public
267-
else fetch_expert_infos(
268-
graph_client=self.graph_client, channel=channel
269-
)
270-
)
271-
external_user_emails = set(
272-
expert_info.email
273-
for expert_info in expert_infos
274-
if expert_info.email
262+
external_access = fetch_external_access(
263+
graph_client=self.graph_client, channel=channel
275264
)
276265

277266
messages = fetch_messages(
@@ -287,11 +276,7 @@ def retrieve_all_slim_documents(
287276
slim_doc_buffer.append(
288277
SlimDocument(
289278
id=message.id,
290-
external_access=ExternalAccess(
291-
external_user_emails=external_user_emails,
292-
external_user_group_ids=set(),
293-
is_public=is_public,
294-
),
279+
external_access=external_access,
295280
)
296281
)
297282

@@ -336,9 +321,6 @@ def _convert_thread_to_document(
336321
if len(thread) == 0:
337322
return None
338323

339-
expert_infos = fetch_expert_infos(graph_client=graph_client, channel=channel)
340-
emails = set(expert_info.email for expert_info in expert_infos if expert_info.email)
341-
342324
most_recent_message_datetime: datetime | None = None
343325
top_message = thread[0]
344326
thread_text = ""
@@ -361,7 +343,10 @@ def _convert_thread_to_document(
361343
return None
362344

363345
semantic_string = _construct_semantic_identifier(channel, top_message)
364-
is_public = _is_channel_public(channel=channel)
346+
expert_infos = fetch_expert_infos(graph_client=graph_client, channel=channel)
347+
external_access = fetch_external_access(
348+
graph_client=graph_client, channel=channel, expert_infos=expert_infos
349+
)
365350

366351
return Document(
367352
id=top_message.id,
@@ -372,11 +357,7 @@ def _convert_thread_to_document(
372357
doc_updated_at=most_recent_message_datetime,
373358
primary_owners=expert_infos,
374359
metadata={},
375-
external_access=ExternalAccess(
376-
external_user_emails=emails,
377-
external_user_group_ids=set(),
378-
is_public=is_public,
379-
),
360+
external_access=external_access,
380361
)
381362

382363

@@ -558,12 +539,6 @@ def _collect_documents_for_channel(
558539
)
559540

560541

561-
def _is_channel_public(channel: Channel) -> bool:
562-
return (
563-
channel.membership_type and channel.membership_type == _PUBLIC_MEMBERSHIP_TYPE
564-
)
565-
566-
567542
if __name__ == "__main__":
568543
from tests.daily.connectors.utils import load_everything_from_checkpoint_connector
569544

backend/onyx/connectors/teams/utils.py

Lines changed: 56 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from office365.teams.channels.channel import Channel # type: ignore
99
from office365.teams.channels.channel import ConversationMember # type: ignore
1010

11+
from onyx.access.models import ExternalAccess
1112
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
1213
from onyx.connectors.models import BasicExpertInfo
1314
from onyx.connectors.teams.models import Message
@@ -16,6 +17,9 @@
1617
logger = setup_logger()
1718

1819

20+
_PUBLIC_MEMBERSHIP_TYPE = "standard" # public teams channel
21+
22+
1923
def _retry(
2024
graph_client: GraphClient,
2125
request_url: str,
@@ -64,6 +68,34 @@ def _get_next_url(
6468
return next_url.removeprefix(graph_client.service_root_url()).removeprefix("/")
6569

6670

71+
def _get_or_fetch_email(
72+
graph_client: GraphClient,
73+
member: ConversationMember,
74+
) -> str | None:
75+
if email := member.properties.get("email"):
76+
return email
77+
78+
user_id = member.properties.get("userId")
79+
if not user_id:
80+
logger.warn(f"No user-id found for this member; {member=}")
81+
return None
82+
83+
json_data = _retry(graph_client=graph_client, request_url=f"users/{user_id}")
84+
email = json_data.get("userPrincipalName")
85+
86+
if not isinstance(email, str):
87+
logger.warn(f"Expected email to be of type str, instead got {email=}")
88+
return None
89+
90+
return email
91+
92+
93+
def _is_channel_public(channel: Channel) -> bool:
94+
return (
95+
channel.membership_type and channel.membership_type == _PUBLIC_MEMBERSHIP_TYPE
96+
)
97+
98+
6799
def fetch_messages(
68100
graph_client: GraphClient,
69101
team_id: str,
@@ -115,28 +147,6 @@ def fetch_replies(
115147
)
116148

117149

118-
def _get_or_fetch_email(
119-
graph_client: GraphClient,
120-
member: ConversationMember,
121-
) -> str | None:
122-
if email := member.properties.get("email"):
123-
return email
124-
125-
user_id = member.properties.get("userId")
126-
if not user_id:
127-
logger.warn(f"No user-id found for this member; {member=}")
128-
return None
129-
130-
json_data = _retry(graph_client=graph_client, request_url=f"users/{user_id}")
131-
email = json_data.get("userPrincipalName")
132-
133-
if not isinstance(email, str):
134-
logger.warn(f"Expected email to be of type str, instead got {email=}")
135-
return None
136-
137-
return email
138-
139-
140150
def fetch_expert_infos(
141151
graph_client: GraphClient, channel: Channel
142152
) -> list[BasicExpertInfo]:
@@ -164,3 +174,27 @@ def fetch_expert_infos(
164174
)
165175

166176
return expert_infos
177+
178+
179+
def fetch_external_access(
180+
graph_client: GraphClient,
181+
channel: Channel,
182+
expert_infos: list[BasicExpertInfo] | None = None,
183+
) -> ExternalAccess:
184+
is_public = _is_channel_public(channel=channel)
185+
186+
if is_public:
187+
return ExternalAccess.public()
188+
189+
expert_infos = (
190+
expert_infos
191+
if expert_infos is not None
192+
else fetch_expert_infos(graph_client=graph_client, channel=channel)
193+
)
194+
emails = {expert_info.email for expert_info in expert_infos if expert_info.email}
195+
196+
return ExternalAccess(
197+
external_user_emails=emails,
198+
external_user_group_ids=set(),
199+
is_public=is_public,
200+
)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from pydantic import BaseModel
2+
3+
from onyx.access.models import ExternalAccess
4+
from onyx.connectors.models import Document
5+
6+
7+
class TeamsThread(BaseModel):
8+
thread: str
9+
external_access: ExternalAccess
10+
11+
@classmethod
12+
def from_doc(cls, document: Document) -> "TeamsThread":
13+
14+
assert (
15+
document.external_access
16+
), f"ExternalAccess should always be available, instead got {document=}"
17+
18+
return cls(
19+
thread=document.get_text_content(),
20+
external_access=document.external_access,
21+
)

0 commit comments

Comments
 (0)