From 26e33229bb2352dedfe65b3f4a8ed5bda31f44e4 Mon Sep 17 00:00:00 2001 From: Weves Date: Wed, 16 Apr 2025 17:05:50 -0700 Subject: [PATCH 1/7] Pull in more fields for Jira --- .../onyx/connectors/onyx_jira/connector.py | 44 ++++++++---- backend/onyx/connectors/onyx_jira/utils.py | 4 +- .../daily/connectors/jira/test_jira_basic.py | 72 +++++++++++++++---- 3 files changed, 91 insertions(+), 29 deletions(-) diff --git a/backend/onyx/connectors/onyx_jira/connector.py b/backend/onyx/connectors/onyx_jira/connector.py index 99cde93e866..3e29d3acd4f 100644 --- a/backend/onyx/connectors/onyx_jira/connector.py +++ b/backend/onyx/connectors/onyx_jira/connector.py @@ -44,6 +44,18 @@ _JIRA_SLIM_PAGE_SIZE = 500 _JIRA_FULL_PAGE_SIZE = 50 +# Constants for Jira field names +_FIELD_REPORTER = "reporter" +_FIELD_ASSIGNEE = "assignee" +_FIELD_PRIORITY = "priority" +_FIELD_STATUS = "status" +_FIELD_RESOLUTION = "resolution" +_FIELD_LABELS = "labels" +_FIELD_KEY = "key" +_FIELD_CREATED = "created" +_FIELD_DUEDATE = "duedate" +_FIELD_ISSUETYPE = "issuetype" + def _perform_jql_search( jira_client: JIRA, @@ -107,32 +119,40 @@ def process_jira_issue( page_url = build_jira_url(jira_client, issue.key) + metadata_dict: dict[str, str | list[str]] = {} people = set() try: - creator = best_effort_get_field_from_issue(issue, "creator") + creator = best_effort_get_field_from_issue(issue, _FIELD_REPORTER) if basic_expert_info := best_effort_basic_expert_info(creator): people.add(basic_expert_info) + metadata_dict[_FIELD_REPORTER] = basic_expert_info.get_semantic_name() except Exception: # Author should exist but if not, doesn't matter pass try: - assignee = best_effort_get_field_from_issue(issue, "assignee") + assignee = best_effort_get_field_from_issue(issue, _FIELD_ASSIGNEE) if basic_expert_info := best_effort_basic_expert_info(assignee): people.add(basic_expert_info) + metadata_dict[_FIELD_ASSIGNEE] = basic_expert_info.get_semantic_name() except Exception: # Author should exist but if not, doesn't matter pass - metadata_dict = {} - if priority := best_effort_get_field_from_issue(issue, "priority"): - metadata_dict["priority"] = priority.name - if status := best_effort_get_field_from_issue(issue, "status"): - metadata_dict["status"] = status.name - if resolution := best_effort_get_field_from_issue(issue, "resolution"): - metadata_dict["resolution"] = resolution.name - if labels := best_effort_get_field_from_issue(issue, "labels"): - metadata_dict["labels"] = labels + if priority := best_effort_get_field_from_issue(issue, _FIELD_PRIORITY): + metadata_dict[_FIELD_PRIORITY] = priority.name + if status := best_effort_get_field_from_issue(issue, _FIELD_STATUS): + metadata_dict[_FIELD_STATUS] = status.name + if resolution := best_effort_get_field_from_issue(issue, _FIELD_RESOLUTION): + metadata_dict[_FIELD_RESOLUTION] = resolution.name + if labels := best_effort_get_field_from_issue(issue, _FIELD_LABELS): + metadata_dict[_FIELD_LABELS] = labels + if created := best_effort_get_field_from_issue(issue, _FIELD_CREATED): + metadata_dict[_FIELD_CREATED] = created + if duedate := best_effort_get_field_from_issue(issue, _FIELD_DUEDATE): + metadata_dict[_FIELD_DUEDATE] = duedate + if issuetype := best_effort_get_field_from_issue(issue, _FIELD_ISSUETYPE): + metadata_dict[_FIELD_ISSUETYPE] = issuetype.name return Document( id=page_url, @@ -277,7 +297,7 @@ def retrieve_all_slim_documents( max_results=_JIRA_SLIM_PAGE_SIZE, fields="key", ): - issue_key = best_effort_get_field_from_issue(issue, "key") + issue_key = best_effort_get_field_from_issue(issue, _FIELD_KEY) id = build_jira_url(self.jira_client, issue_key) slim_doc_batch.append( SlimDocument( diff --git a/backend/onyx/connectors/onyx_jira/utils.py b/backend/onyx/connectors/onyx_jira/utils.py index b5ec5f0a01d..96fb9276ee9 100644 --- a/backend/onyx/connectors/onyx_jira/utils.py +++ b/backend/onyx/connectors/onyx_jira/utils.py @@ -23,8 +23,8 @@ def best_effort_basic_expert_info(obj: Any) -> BasicExpertInfo | None: display_name = None email = None - if hasattr(obj, "display_name"): - display_name = obj.display_name + if hasattr(obj, "displayName"): + display_name = obj.displayName else: display_name = obj.get("displayName") diff --git a/backend/tests/daily/connectors/jira/test_jira_basic.py b/backend/tests/daily/connectors/jira/test_jira_basic.py index 885d4f2cacf..d43eb06ef73 100644 --- a/backend/tests/daily/connectors/jira/test_jira_basic.py +++ b/backend/tests/daily/connectors/jira/test_jira_basic.py @@ -4,6 +4,7 @@ import pytest from onyx.configs.constants import DocumentSource +from onyx.connectors.models import Document from onyx.connectors.onyx_jira.connector import JiraConnector from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector @@ -30,19 +31,60 @@ def test_jira_connector_basic(jira_connector: JiraConnector) -> None: start=0, end=time.time(), ) - assert len(docs) == 1 - doc = docs[0] - - assert doc.id == "https://danswerai.atlassian.net/browse/AS-2" - assert doc.semantic_identifier == "AS-2: test123small" - assert doc.source == DocumentSource.JIRA - assert doc.metadata == {"priority": "Medium", "status": "Backlog"} - assert doc.secondary_owners is None - assert doc.title == "AS-2 test123small" - assert doc.from_ingestion_api is False - assert doc.additional_info is None - - assert len(doc.sections) == 1 - section = doc.sections[0] + assert len(docs) == 2 + + # Find story and epic + story: Document | None = None + epic: Document | None = None + for doc in docs: + if doc.metadata["issuetype"] == "Story": + story = doc + elif doc.metadata["issuetype"] == "Epic": + epic = doc + + assert story is not None + assert epic is not None + + # Check task + assert story.id == "https://danswerai.atlassian.net/browse/AS-3" + assert story.semantic_identifier == "AS-3: test123small" + assert story.source == DocumentSource.JIRA + assert story.metadata == { + "priority": "Medium", + "status": "Backlog", + "reporter": "Chris Weaver", + "assignee": "Chris Weaver", + "issuetype": "Story", + "created": "2025-04-16T16:44:06.716-0700", + } + assert story.secondary_owners is None + assert story.title == "AS-3 test123small" + assert story.from_ingestion_api is False + assert story.additional_info is None + + assert len(story.sections) == 1 + section = story.sections[0] + assert section.text == "example_text\n" + assert section.link == "https://danswerai.atlassian.net/browse/AS-3" + + # Check epic + assert epic.id == "https://danswerai.atlassian.net/browse/AS-4" + assert epic.semantic_identifier == "AS-4: EPIC" + assert epic.source == DocumentSource.JIRA + assert epic.metadata == { + "priority": "Medium", + "status": "Backlog", + "reporter": "Founder Onyx", + "assignee": "Chris Weaver", + "issuetype": "Epic", + "created": "2025-04-16T16:55:53.068-0700", + } + assert epic.secondary_owners is None + assert epic.title == "AS-4 EPIC" + assert epic.from_ingestion_api is False + assert epic.additional_info is None + + assert len(epic.sections) == 1 + section = epic.sections[0] assert section.text == "example_text\n" - assert section.link == "https://danswerai.atlassian.net/browse/AS-2" + assert section.link == "https://danswerai.atlassian.net/browse/AS-4" From d1b93c4003c55710d011528a72ca955e397ada0d Mon Sep 17 00:00:00 2001 From: Weves Date: Wed, 16 Apr 2025 17:15:17 -0700 Subject: [PATCH 2/7] Fix tests --- backend/tests/daily/connectors/jira/test_jira_basic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/tests/daily/connectors/jira/test_jira_basic.py b/backend/tests/daily/connectors/jira/test_jira_basic.py index d43eb06ef73..ad93f05752a 100644 --- a/backend/tests/daily/connectors/jira/test_jira_basic.py +++ b/backend/tests/daily/connectors/jira/test_jira_basic.py @@ -1,5 +1,6 @@ import os import time +from unittest.mock import patch import pytest @@ -25,6 +26,10 @@ def jira_connector() -> JiraConnector: return connector +@patch( + "onyx.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) def test_jira_connector_basic(jira_connector: JiraConnector) -> None: docs = load_all_docs_from_checkpoint_connector( connector=jira_connector, From ba0353d82b3660dd6e2e5fb777288af19bd6b2da Mon Sep 17 00:00:00 2001 From: Weves Date: Wed, 16 Apr 2025 17:16:09 -0700 Subject: [PATCH 3/7] Fix --- .../confluence/test_confluence_permissions_basic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py b/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py index 9b0c15b0a6e..f723324ccf5 100644 --- a/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py +++ b/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py @@ -34,6 +34,10 @@ def confluence_connector() -> ConfluenceConnector: # This should never fail because even if the docs in the cloud change, # the full doc ids retrieved should always be a subset of the slim doc ids +@patch( + "onyx.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) def test_confluence_connector_permissions( confluence_connector: ConfluenceConnector, ) -> None: From 92396fbe841d93190806096d0cd4e9eb3431fe8a Mon Sep 17 00:00:00 2001 From: Weves Date: Wed, 16 Apr 2025 17:29:46 -0700 Subject: [PATCH 4/7] more fix --- .../confluence/test_confluence_permissions_basic.py | 1 + backend/tests/daily/connectors/jira/test_jira_basic.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py b/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py index f723324ccf5..9328b2bb708 100644 --- a/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py +++ b/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py @@ -39,6 +39,7 @@ def confluence_connector() -> ConfluenceConnector: return_value=None, ) def test_confluence_connector_permissions( + mock_get_api_key: MagicMock, confluence_connector: ConfluenceConnector, ) -> None: # Get all doc IDs from the full connector diff --git a/backend/tests/daily/connectors/jira/test_jira_basic.py b/backend/tests/daily/connectors/jira/test_jira_basic.py index ad93f05752a..f4cb25a7948 100644 --- a/backend/tests/daily/connectors/jira/test_jira_basic.py +++ b/backend/tests/daily/connectors/jira/test_jira_basic.py @@ -1,5 +1,6 @@ import os import time +from unittest.mock import MagicMock from unittest.mock import patch import pytest @@ -30,7 +31,9 @@ def jira_connector() -> JiraConnector: "onyx.file_processing.extract_file_text.get_unstructured_api_key", return_value=None, ) -def test_jira_connector_basic(jira_connector: JiraConnector) -> None: +def test_jira_connector_basic( + mock_get_api_key: MagicMock, jira_connector: JiraConnector +) -> None: docs = load_all_docs_from_checkpoint_connector( connector=jira_connector, start=0, From fa8cc1cebba9406fff47165af20687cab909a6e0 Mon Sep 17 00:00:00 2001 From: Weves Date: Wed, 16 Apr 2025 17:42:17 -0700 Subject: [PATCH 5/7] Fix --- .../connectors/confluence/test_confluence_permissions_basic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py b/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py index 9328b2bb708..2338483fc58 100644 --- a/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py +++ b/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py @@ -81,6 +81,8 @@ def test_confluence_connector_restriction_handling( "confluence_username": os.environ["CONFLUENCE_USER_NAME"], "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"], } + # this prevents redis calls inside of OnyxConfluence + mock_provider_instance.is_dynamic.return_value = False # Make the class return our configured instance when called mock_db_provider_class.return_value = mock_provider_instance From 5ab7587f1d879369614809d814abaea54d57b1c8 Mon Sep 17 00:00:00 2001 From: Weves Date: Wed, 16 Apr 2025 17:58:58 -0700 Subject: [PATCH 6/7] Fix S3 test --- .../tests/daily/connectors/blob/test_blob_connector.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/backend/tests/daily/connectors/blob/test_blob_connector.py b/backend/tests/daily/connectors/blob/test_blob_connector.py index b2819a2d8c3..5822d9ec38e 100644 --- a/backend/tests/daily/connectors/blob/test_blob_connector.py +++ b/backend/tests/daily/connectors/blob/test_blob_connector.py @@ -9,7 +9,6 @@ from onyx.connectors.models import Document from onyx.connectors.models import TextSection from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS -from onyx.file_processing.extract_file_text import ACCEPTED_IMAGE_FILE_EXTENSIONS from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS from onyx.file_processing.extract_file_text import get_file_ext @@ -42,7 +41,7 @@ def test_blob_s3_connector( """ Plain and document file types should be fully indexed. - Multimedia and unknown file types will be indexed by title only with one empty section. + Multimedia and unknown file types will be indexed be skipped. This is intentional in order to allow searching by just the title even if we can't index the file content. @@ -53,8 +52,7 @@ def test_blob_s3_connector( for doc in doc_batch: all_docs.append(doc) - # - assert len(all_docs) == 19 + assert len(all_docs) == 15 for doc in all_docs: section = doc.sections[0] @@ -69,9 +67,5 @@ def test_blob_s3_connector( assert len(section.text) > 0 continue - if file_extension in ACCEPTED_IMAGE_FILE_EXTENSIONS: - assert len(section.text) == 0 - continue - # unknown extension assert len(section.text) == 0 From c884fb403a3d1e4c9835d802a0716454dbe30c19 Mon Sep 17 00:00:00 2001 From: Weves Date: Wed, 16 Apr 2025 18:00:21 -0700 Subject: [PATCH 7/7] fix --- backend/tests/daily/connectors/blob/test_blob_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/tests/daily/connectors/blob/test_blob_connector.py b/backend/tests/daily/connectors/blob/test_blob_connector.py index 5822d9ec38e..a699c55560b 100644 --- a/backend/tests/daily/connectors/blob/test_blob_connector.py +++ b/backend/tests/daily/connectors/blob/test_blob_connector.py @@ -41,7 +41,8 @@ def test_blob_s3_connector( """ Plain and document file types should be fully indexed. - Multimedia and unknown file types will be indexed be skipped. + Multimedia and unknown file types will be indexed be skipped unless `set_allow_images` + is called with `True`. This is intentional in order to allow searching by just the title even if we can't index the file content.