From e74abdd897e9c4e56b30bd1f69af1cd0b2b0099c Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Fri, 27 Jun 2025 13:43:47 -0700 Subject: [PATCH 1/5] Add new convenience method --- backend/onyx/access/models.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backend/onyx/access/models.py b/backend/onyx/access/models.py index 2fa23465443..765306d543d 100644 --- a/backend/onyx/access/models.py +++ b/backend/onyx/access/models.py @@ -40,6 +40,14 @@ def truncate_set(s: set[str], max_len: int = 100) -> str: def num_entries(self) -> int: return len(self.external_user_emails) + len(self.external_user_group_ids) + @classmethod + def public(cls) -> "ExternalAccess": + return cls( + external_user_emails=set(), + external_user_group_ids=set(), + is_public=True, + ) + @classmethod def empty(cls) -> "ExternalAccess": """ From 35bd540d70a9b9b66d11a2a7da2fff143818ad0b Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Fri, 27 Jun 2025 13:45:39 -0700 Subject: [PATCH 2/5] Fix bug in which emails would be fetched for initial indexing --- backend/onyx/connectors/teams/connector.py | 43 +++--------- backend/onyx/connectors/teams/utils.py | 78 ++++++++++++++++------ 2 files changed, 65 insertions(+), 56 deletions(-) diff --git a/backend/onyx/connectors/teams/connector.py b/backend/onyx/connectors/teams/connector.py index 8fe79653bef..ae1f6dc7156 100644 --- a/backend/onyx/connectors/teams/connector.py +++ b/backend/onyx/connectors/teams/connector.py @@ -13,7 +13,6 @@ from office365.teams.channels.channel import Channel # type: ignore from office365.teams.team import Team # type: ignore -from onyx.access.models import ExternalAccess from onyx.configs.constants import DocumentSource from onyx.connectors.exceptions import ConnectorValidationError from onyx.connectors.exceptions import CredentialExpiredError @@ -33,6 +32,7 @@ from onyx.connectors.models import TextSection from onyx.connectors.teams.models import Message from onyx.connectors.teams.utils import fetch_expert_infos +from onyx.connectors.teams.utils import fetch_external_access from onyx.connectors.teams.utils import fetch_messages from onyx.connectors.teams.utils import fetch_replies from onyx.file_processing.html_utils import parse_html_page_basic @@ -43,7 +43,6 @@ logger = setup_logger() _SLIM_DOC_BATCH_SIZE = 5000 -_PUBLIC_MEMBERSHIP_TYPE = "standard" # public teams channel class TeamsCheckpoint(ConnectorCheckpoint): @@ -260,18 +259,8 @@ def retrieve_all_slim_documents( ) continue - is_public = _is_channel_public(channel=channel) - expert_infos = ( - set() - if is_public - else fetch_expert_infos( - graph_client=self.graph_client, channel=channel - ) - ) - external_user_emails = set( - expert_info.email - for expert_info in expert_infos - if expert_info.email + external_access = fetch_external_access( + graph_client=self.graph_client, channel=channel ) messages = fetch_messages( @@ -287,11 +276,7 @@ def retrieve_all_slim_documents( slim_doc_buffer.append( SlimDocument( id=message.id, - external_access=ExternalAccess( - external_user_emails=external_user_emails, - external_user_group_ids=set(), - is_public=is_public, - ), + external_access=external_access, ) ) @@ -336,9 +321,6 @@ def _convert_thread_to_document( if len(thread) == 0: return None - expert_infos = fetch_expert_infos(graph_client=graph_client, channel=channel) - emails = set(expert_info.email for expert_info in expert_infos if expert_info.email) - most_recent_message_datetime: datetime | None = None top_message = thread[0] thread_text = "" @@ -361,7 +343,10 @@ def _convert_thread_to_document( return None semantic_string = _construct_semantic_identifier(channel, top_message) - is_public = _is_channel_public(channel=channel) + expert_infos = fetch_expert_infos(graph_client=graph_client, channel=channel) + external_access = fetch_external_access( + graph_client=graph_client, channel=channel, expert_infos=expert_infos + ) return Document( id=top_message.id, @@ -372,11 +357,7 @@ def _convert_thread_to_document( doc_updated_at=most_recent_message_datetime, primary_owners=expert_infos, metadata={}, - external_access=ExternalAccess( - external_user_emails=emails, - external_user_group_ids=set(), - is_public=is_public, - ), + external_access=external_access, ) @@ -558,12 +539,6 @@ def _collect_documents_for_channel( ) -def _is_channel_public(channel: Channel) -> bool: - return ( - channel.membership_type and channel.membership_type == _PUBLIC_MEMBERSHIP_TYPE - ) - - if __name__ == "__main__": from tests.daily.connectors.utils import load_everything_from_checkpoint_connector diff --git a/backend/onyx/connectors/teams/utils.py b/backend/onyx/connectors/teams/utils.py index 8be8817586e..f0392451063 100644 --- a/backend/onyx/connectors/teams/utils.py +++ b/backend/onyx/connectors/teams/utils.py @@ -8,6 +8,7 @@ from office365.teams.channels.channel import Channel # type: ignore from office365.teams.channels.channel import ConversationMember # type: ignore +from onyx.access.models import ExternalAccess from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import BasicExpertInfo from onyx.connectors.teams.models import Message @@ -16,6 +17,9 @@ logger = setup_logger() +_PUBLIC_MEMBERSHIP_TYPE = "standard" # public teams channel + + def _retry( graph_client: GraphClient, request_url: str, @@ -64,6 +68,34 @@ def _get_next_url( return next_url.removeprefix(graph_client.service_root_url()).removeprefix("/") +def _get_or_fetch_email( + graph_client: GraphClient, + member: ConversationMember, +) -> str | None: + if email := member.properties.get("email"): + return email + + user_id = member.properties.get("userId") + if not user_id: + logger.warn(f"No user-id found for this member; {member=}") + return None + + json_data = _retry(graph_client=graph_client, request_url=f"users/{user_id}") + email = json_data.get("userPrincipalName") + + if not isinstance(email, str): + logger.warn(f"Expected email to be of type str, instead got {email=}") + return None + + return email + + +def _is_channel_public(channel: Channel) -> bool: + return ( + channel.membership_type and channel.membership_type == _PUBLIC_MEMBERSHIP_TYPE + ) + + def fetch_messages( graph_client: GraphClient, team_id: str, @@ -115,28 +147,6 @@ def fetch_replies( ) -def _get_or_fetch_email( - graph_client: GraphClient, - member: ConversationMember, -) -> str | None: - if email := member.properties.get("email"): - return email - - user_id = member.properties.get("userId") - if not user_id: - logger.warn(f"No user-id found for this member; {member=}") - return None - - json_data = _retry(graph_client=graph_client, request_url=f"users/{user_id}") - email = json_data.get("userPrincipalName") - - if not isinstance(email, str): - logger.warn(f"Expected email to be of type str, instead got {email=}") - return None - - return email - - def fetch_expert_infos( graph_client: GraphClient, channel: Channel ) -> list[BasicExpertInfo]: @@ -164,3 +174,27 @@ def fetch_expert_infos( ) return expert_infos + + +def fetch_external_access( + graph_client: GraphClient, + channel: Channel, + expert_infos: list[BasicExpertInfo] | None = None, +) -> ExternalAccess: + is_public = _is_channel_public(channel=channel) + + if is_public: + return ExternalAccess.public() + + expert_infos = ( + expert_infos + if expert_infos + else fetch_expert_infos(graph_client=graph_client, channel=channel) + ) + emails = {expert_info.email for expert_info in expert_infos if expert_info.email} + + return ExternalAccess( + external_user_emails=emails, + external_user_group_ids=set(), + is_public=is_public, + ) From 248b8e227d15f4fb387fe2bc2541798fc32bd3da Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Fri, 27 Jun 2025 14:19:17 -0700 Subject: [PATCH 3/5] Improve tests for MS Teams connector --- .../tests/daily/connectors/teams/models.py | 21 ++ .../connectors/teams/test_teams_connector.py | 195 +++++++++++------- 2 files changed, 139 insertions(+), 77 deletions(-) create mode 100644 backend/tests/daily/connectors/teams/models.py diff --git a/backend/tests/daily/connectors/teams/models.py b/backend/tests/daily/connectors/teams/models.py new file mode 100644 index 00000000000..d7f99995179 --- /dev/null +++ b/backend/tests/daily/connectors/teams/models.py @@ -0,0 +1,21 @@ +from pydantic import BaseModel + +from onyx.access.models import ExternalAccess +from onyx.connectors.models import Document + + +class TeamsThread(BaseModel): + thread: str + external_access: ExternalAccess + + @classmethod + def from_doc(cls, document: Document) -> "TeamsThread": + + assert ( + document.external_access + ), f"ExternalAccess should always be available, instead got {document=}" + + return cls( + thread=document.get_text_content(), + external_access=document.external_access, + ) diff --git a/backend/tests/daily/connectors/teams/test_teams_connector.py b/backend/tests/daily/connectors/teams/test_teams_connector.py index 4a8a6c320c9..ccf61434ff4 100644 --- a/backend/tests/daily/connectors/teams/test_teams_connector.py +++ b/backend/tests/daily/connectors/teams/test_teams_connector.py @@ -2,14 +2,76 @@ import time import pytest -from pydantic import BaseModel -from onyx.connectors.models import Document +from onyx.access.models import ExternalAccess from onyx.connectors.teams.connector import TeamsConnector +from tests.daily.connectors.teams.models import TeamsThread from tests.daily.connectors.utils import load_everything_from_checkpoint_connector from tests.daily.connectors.utils import to_documents +TEAMS_THREAD = [ + # Posted in "Public Channel" + TeamsThread( + thread="This is the first message in Onyx-Testing ...This is a reply!This is a second reply.Third.4th.5", + external_access=ExternalAccess( + external_user_emails=set(), + external_user_group_ids=set(), + is_public=True, + ), + ), + TeamsThread( + thread="Testing body.", + external_access=ExternalAccess( + external_user_emails=set(), + external_user_group_ids=set(), + is_public=True, + ), + ), + TeamsThread( + thread="Hello, world! Nice to meet you all.", + external_access=ExternalAccess( + external_user_emails=set(), + external_user_group_ids=set(), + is_public=True, + ), + ), + # Posted in "Private Channel (Raunak is excluded)" + TeamsThread( + thread="This is a test post. Raunak should not be able to see this!", + external_access=ExternalAccess( + external_user_emails=set(["test@danswerai.onmicrosoft.com"]), + external_user_group_ids=set(), + is_public=False, + ), + ), + # Posted in "Private Channel (Raunak is a member)" + TeamsThread( + thread="This is a test post in a private channel that Raunak does have access to! Hello, Raunak!" + "Hello, world! I am just a member in this chat, but not an owner.", + external_access=ExternalAccess( + external_user_emails=set( + ["test@danswerai.onmicrosoft.com", "raunak@onyx.app"] + ), + external_user_group_ids=set(), + is_public=False, + ), + ), + # Posted in "Private Channel (Raunak owns)" + TeamsThread( + thread="This is a test post in a private channel that Raunak is an owner of! Whoa!" + "Hello, world! I am an owner of this chat. The power!", + external_access=ExternalAccess( + external_user_emails=set( + ["test@danswerai.onmicrosoft.com", "raunak@onyx.app"] + ), + external_user_group_ids=set(), + is_public=False, + ), + ), +] + + @pytest.fixture def teams_credentials() -> dict[str, str]: app_id = os.environ["TEAMS_APPLICATION_ID"] @@ -32,24 +94,6 @@ def teams_connector( return teams_connector -class TeamsThread(BaseModel): - thread: str - member_emails: set[str] - is_public: bool - - -def _doc_to_teams_thread(doc: Document) -> TeamsThread: - assert ( - doc.external_access - ), f"ExternalAccess should always be available, instead got {doc=}" - - return TeamsThread( - thread=doc.get_text_content(), - member_emails=doc.external_access.external_user_emails, - is_public=doc.external_access.is_public, - ) - - def _build_map(threads: list[TeamsThread]) -> dict[str, TeamsThread]: map: dict[str, TeamsThread] = {} @@ -60,68 +104,65 @@ def _build_map(threads: list[TeamsThread]) -> dict[str, TeamsThread]: return map +def _assert_is_valid_external_access( + external_access: ExternalAccess, +) -> None: + assert ( + not external_access.external_user_group_ids + ), f"{external_access.external_user_group_ids=} should be empty for MS Teams" + + if external_access.is_public: + assert ( + not external_access.external_user_emails + ), f"{external_access.external_user_emails=} should be empty for public channels" + else: + assert ( + external_access.external_user_emails + ), f"{external_access.external_user_emails=} should contains at least one user for private channels" + + @pytest.mark.parametrize( - "expected_docs", - [ - [ - # Posted in "Public Channel" - TeamsThread( - thread="This is the first message in Onyx-Testing ...This is a reply!This is a second reply.Third.4th.5", - member_emails=set(), - is_public=True, - ), - TeamsThread( - thread="Testing body.", - member_emails=set(), - is_public=True, - ), - TeamsThread( - thread="Hello, world! Nice to meet you all.", - member_emails=set(), - is_public=True, - ), - # Posted in "Private Channel (Raunak is excluded)" - TeamsThread( - thread="This is a test post. Raunak should not be able to see this!", - member_emails=set(["test@danswerai.onmicrosoft.com"]), - is_public=False, - ), - # Posted in "Private Channel (Raunak is a member)" - TeamsThread( - thread="This is a test post in a private channel that Raunak does have access to! Hello, Raunak!" - "Hello, world! I am just a member in this chat, but not an owner.", - member_emails=set( - ["test@danswerai.onmicrosoft.com", "raunak@onyx.app"] - ), - is_public=False, - ), - # Posted in "Private Channel (Raunak owns)" - TeamsThread( - thread="This is a test post in a private channel that Raunak is an owner of! Whoa!" - "Hello, world! I am an owner of this chat. The power!", - member_emails=set( - ["test@danswerai.onmicrosoft.com", "raunak@onyx.app"] - ), - is_public=False, - ), - ], - ], + "expected_teams_threads", + [TEAMS_THREAD], ) -def test_teams_connector( +def test_loading_all_docs_from_teams_connector( teams_connector: TeamsConnector, - expected_docs: list[TeamsThread], + expected_teams_threads: list[TeamsThread], ) -> None: - docs_iter = load_everything_from_checkpoint_connector( - connector=teams_connector, - start=0.0, - end=time.time(), + docs = list( + to_documents( + iterator=iter( + load_everything_from_checkpoint_connector( + connector=teams_connector, + start=0.0, + end=time.time(), + ) + ) + ) ) + actual_teams_threads = [TeamsThread.from_doc(doc) for doc in docs] + actual_teams_threads_map = _build_map(threads=actual_teams_threads) + expected_teams_threads_map = _build_map(threads=expected_teams_threads) + + # Assert that each thread document matches what we expect. + assert actual_teams_threads_map == expected_teams_threads_map + + # Assert that all the `ExternalAccess` instances are well-formed. + for thread in actual_teams_threads: + _assert_is_valid_external_access(external_access=thread.external_access) - actual_docs = [ - _doc_to_teams_thread(doc=doc) for doc in to_documents(iterator=iter(docs_iter)) - ] - actual_docs_map = _build_map(threads=actual_docs) - expected_docs_map = _build_map(threads=expected_docs) +def test_slim_docs_retrieval_from_teams_connector( + teams_connector: TeamsConnector, +) -> None: + slim_docs = [ + slim_doc + for slim_doc_batch in teams_connector.retrieve_all_slim_documents() + for slim_doc in slim_doc_batch + ] - assert actual_docs_map == expected_docs_map + for slim_doc in slim_docs: + assert ( + slim_doc.external_access + ), f"ExternalAccess should always be available, instead got {slim_doc=}" + _assert_is_valid_external_access(external_access=slim_doc.external_access) From 3d7ea520079c88aadb181475e7bf749234b77f15 Mon Sep 17 00:00:00 2001 From: Weves Date: Fri, 27 Jun 2025 16:16:49 -0700 Subject: [PATCH 4/5] Fix test_gdrive_perm_sync_with_real_data patching --- .../connectors/google_drive/test_drive_perm_sync.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/tests/daily/connectors/google_drive/test_drive_perm_sync.py b/backend/tests/daily/connectors/google_drive/test_drive_perm_sync.py index c7bf159385f..ea8517e9356 100644 --- a/backend/tests/daily/connectors/google_drive/test_drive_perm_sync.py +++ b/backend/tests/daily/connectors/google_drive/test_drive_perm_sync.py @@ -74,16 +74,17 @@ def test_gdrive_perm_sync_with_real_data( doc_access_generator = gdrive_doc_sync(mock_cc_pair, lambda: [], mock_heartbeat) doc_access_list = list(doc_access_generator) + # Verify we got some results + assert len(doc_access_list) > 0 + print(f"Found {len(doc_access_list)} documents with permissions") + # create new connector with patch( "ee.onyx.external_permissions.google_drive.group_sync.GoogleDriveConnector", return_value=_build_connector(google_drive_service_acct_connector_factory), ): - external_user_groups = gdrive_group_sync("test_tenant", mock_cc_pair) - - # Verify we got some results - assert len(doc_access_list) > 0 - print(f"Found {len(doc_access_list)} documents with permissions") + external_user_group_generator = gdrive_group_sync("test_tenant", mock_cc_pair) + external_user_groups = list(external_user_group_generator) # map group ids to emails group_id_to_email_mapping: dict[str, set[str]] = defaultdict(set) From d40ee294054ebd2b4da9a417a74fdfad8e75d32f Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Fri, 27 Jun 2025 16:33:20 -0700 Subject: [PATCH 5/5] Protect against incorrect truthiness --- backend/onyx/connectors/teams/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/onyx/connectors/teams/utils.py b/backend/onyx/connectors/teams/utils.py index f0392451063..92ec179ce3a 100644 --- a/backend/onyx/connectors/teams/utils.py +++ b/backend/onyx/connectors/teams/utils.py @@ -188,7 +188,7 @@ def fetch_external_access( expert_infos = ( expert_infos - if expert_infos + if expert_infos is not None else fetch_expert_infos(graph_client=graph_client, channel=channel) ) emails = {expert_info.email for expert_info in expert_infos if expert_info.email}