From ebfaecd366020367196af09baf94da01ab63dcaa Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Wed, 2 Apr 2025 20:55:20 -0700 Subject: [PATCH 1/7] WIP --- backend/onyx/chat/process_message.py | 12 +- .../google_drive/section_extraction.py | 150 +++++++++++------- 2 files changed, 101 insertions(+), 61 deletions(-) diff --git a/backend/onyx/chat/process_message.py b/backend/onyx/chat/process_message.py index eea541aca7..cea03aeba0 100644 --- a/backend/onyx/chat/process_message.py +++ b/backend/onyx/chat/process_message.py @@ -1089,14 +1089,10 @@ def stream_chat_message_objects( selected_search_docs=selected_db_search_docs, # Deduping happens at the last step to avoid harming quality by dropping content early on # Skip deduping completely for ordering-only mode to save time - dedupe_docs=( - False - if search_for_ordering_only - else ( - retrieval_options.dedupe_docs - if retrieval_options - else False - ) + dedupe_docs=bool( + not search_for_ordering_only + and retrieval_options + and retrieval_options.dedupe_docs ), user_files=user_file_files if search_for_ordering_only else [], loaded_user_files=user_files diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py index 0c63eba5c6..e571302fbc 100644 --- a/backend/onyx/connectors/google_drive/section_extraction.py +++ b/backend/onyx/connectors/google_drive/section_extraction.py @@ -7,17 +7,16 @@ class CurrentHeading(BaseModel): - id: str + id: str | None text: str -def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str: +def _build_gdoc_section_link(doc_id: str, tab_id: str, heading_id: str | None) -> str: """Builds a Google Doc link that jumps to a specific heading""" # NOTE: doesn't support docs with multiple tabs atm, if we need that ask # @Chris - return ( - f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}" - ) + heading_str = f"#heading={heading_id}" if heading_id else "" + return f"https://docs.google.com/document/d/{doc_id}/edit?tab={tab_id}{heading_str}" def _extract_id_from_heading(paragraph: dict[str, Any]) -> str: @@ -31,75 +30,120 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str: for element in paragraph.get("elements", []): if "textRun" in element: text_elements.append(element["textRun"].get("content", "")) + + # Handle links + if "textStyle" in element and "link" in element["textStyle"]: + text_elements.append(f"[{element['textStyle']['link'].get('url', '')}]") return "".join(text_elements) +def _extract_text_from_table(table: dict[str, Any]) -> str: + """Extracts the text content from a table element""" + row_strs = [] + for row in table.get("tableRows", []): + cells = row.get("tableCells", []) + cell_strs = [ + _extract_text_from_paragraph(cell.get("content", {})) for cell in cells + ] + row_strs.append(", ".join(cell_strs)) + return "\n".join(row_strs) + + def get_document_sections( docs_service: GoogleDocsService, doc_id: str, ) -> list[TextSection]: """Extracts sections from a Google Doc, including their headings and content""" # Fetch the document structure - doc = docs_service.documents().get(documentId=doc_id).execute() + doc = ( + docs_service.documents() + .get(documentId=doc_id, includeTabsContent=True) + .execute() + ) # Get the content - content = doc.get("body", {}).get("content", []) + tabs = doc.get("tabs", {}) + sections: list[TextSection] = [] + for tab in tabs: + sections.extend(get_tab_sections(tab, doc_id)) + return sections + + +def _is_heading(paragraph: dict[str, Any]) -> bool: + """Checks if a paragraph (a block of text in a drive document) is a heading""" + if not ( + "paragraphStyle" in paragraph + and "namedStyleType" in paragraph["paragraphStyle"] + ): + return False + + style = paragraph["paragraphStyle"]["namedStyleType"] + is_heading = style.startswith("HEADING_") + is_title = style.startswith("TITLE") + return is_heading or is_title + + +def _add_finished_section( + sections: list[TextSection], + doc_id: str, + tab_id: str, + current_heading: CurrentHeading, + current_section: list[str], +) -> None: + """Adds a finished section to the list of sections if the section has content. + Returns the list of sections to use going forward, which may be the old list + if a new section was not added. + """ + if not (current_section or current_heading.text): + return + # If we were building a previous section, add it to sections list + section_text = f"{current_heading.text}\n" + "\n".join(current_section) + sections.append( + TextSection( + text=section_text.strip(), + link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id), + ) + ) + + +def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]: + tab_id = tab["tabProperties"]["tabId"] + content = tab.get("documentTab", {}).get("body", {}).get("content", []) sections: list[TextSection] = [] current_section: list[str] = [] - current_heading: CurrentHeading | None = None + current_heading = CurrentHeading(id=None, text="") for element in content: - if "paragraph" not in element: - continue - - paragraph = element["paragraph"] - - # Check if this is a heading - if ( - "paragraphStyle" in paragraph - and "namedStyleType" in paragraph["paragraphStyle"] - ): - style = paragraph["paragraphStyle"]["namedStyleType"] - is_heading = style.startswith("HEADING_") - is_title = style.startswith("TITLE") - - if is_heading or is_title: - # If we were building a previous section, add it to sections list - if current_heading is not None and current_section: - heading_text = current_heading.text - section_text = f"{heading_text}\n" + "\n".join(current_section) - sections.append( - TextSection( - text=section_text.strip(), - link=_build_gdoc_section_link(doc_id, current_heading.id), - ) - ) - current_section = [] - - # Start new heading - heading_id = _extract_id_from_heading(paragraph) - heading_text = _extract_text_from_paragraph(paragraph) - current_heading = CurrentHeading( - id=heading_id, - text=heading_text, - ) + if "paragraph" in element: + paragraph = element["paragraph"] + + # If this is not a heading, add content to current section + if not _is_heading(paragraph): + text = _extract_text_from_paragraph(paragraph) + if text.strip(): + current_section.append(text) continue - # Add content to current section - if current_heading is not None: - text = _extract_text_from_paragraph(paragraph) + _add_finished_section( + sections, doc_id, tab_id, current_heading, current_section + ) + + current_section = [] + + # Start new heading + heading_id = _extract_id_from_heading(paragraph) + heading_text = _extract_text_from_paragraph(paragraph) + current_heading = CurrentHeading( + id=heading_id, + text=heading_text, + ) + elif "table" in element: + text = _extract_text_from_table(element["table"]) if text.strip(): current_section.append(text) # Don't forget to add the last section - if current_heading is not None and current_section: - section_text = f"{current_heading.text}\n" + "\n".join(current_section) - sections.append( - TextSection( - text=section_text.strip(), - link=_build_gdoc_section_link(doc_id, current_heading.id), - ) - ) + _add_finished_section(sections, doc_id, tab_id, current_heading, current_section) return sections From a04132fdc83dae940991a21d8f90ecd153532618 Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Fri, 4 Apr 2025 12:49:45 -0700 Subject: [PATCH 2/7] WIP almost done, but realized we can just do basic retrieval --- backend/onyx/configs/app_configs.py | 2 + .../onyx/connectors/google_drive/connector.py | 115 +++++++++++++ .../connectors/google_drive/doc_conversion.py | 8 +- .../google_drive/section_extraction.py | 151 +++++++++++++++--- .../connectors/google_utils/google_utils.py | 48 ++++++ .../onyx/connectors/google_utils/resources.py | 11 ++ .../google_utils/shared_constants.py | 14 ++ 7 files changed, 328 insertions(+), 21 deletions(-) diff --git a/backend/onyx/configs/app_configs.py b/backend/onyx/configs/app_configs.py index a293df6248..f45808f937 100644 --- a/backend/onyx/configs/app_configs.py +++ b/backend/onyx/configs/app_configs.py @@ -163,6 +163,8 @@ MAX_DRIVE_WORKERS = int(os.environ.get("MAX_DRIVE_WORKERS", 4)) +USE_SMART_CHIP_SCOPES = os.environ.get("USE_SMART_CHIP_SCOPES", "").lower() == "true" + # Below are intended to match the env variables names used by the official postgres docker image # https://hub.docker.com/_/postgres POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres" diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py index f9764535b2..551a10a73d 100644 --- a/backend/onyx/connectors/google_drive/connector.py +++ b/backend/onyx/connectors/google_drive/connector.py @@ -18,6 +18,7 @@ from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD from onyx.configs.app_configs import INDEX_BATCH_SIZE from onyx.configs.app_configs import MAX_DRIVE_WORKERS +from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES from onyx.configs.constants import DocumentSource from onyx.connectors.exceptions import ConnectorValidationError from onyx.connectors.exceptions import CredentialExpiredError @@ -39,12 +40,16 @@ from onyx.connectors.google_drive.models import RetrievedDriveFile from onyx.connectors.google_drive.models import StageCompletion from onyx.connectors.google_utils.google_auth import get_google_creds +from onyx.connectors.google_utils.google_utils import create_scripts_file_objects from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval from onyx.connectors.google_utils.google_utils import GoogleFields +from onyx.connectors.google_utils.google_utils import SMART_CHIP_SCRIPT_FILE_NAME from onyx.connectors.google_utils.resources import get_admin_service from onyx.connectors.google_utils.resources import get_drive_service from onyx.connectors.google_utils.resources import get_google_docs_service +from onyx.connectors.google_utils.resources import get_google_scripts_service from onyx.connectors.google_utils.resources import GoogleDriveService +from onyx.connectors.google_utils.resources import GoogleScriptsService from onyx.connectors.google_utils.shared_constants import ( DB_CREDENTIALS_PRIMARY_ADMIN_KEY, ) @@ -90,6 +95,7 @@ def _convert_single_file( creds: Any, allow_images: bool, size_threshold: int, + smart_chips_deployment_id: str, retriever_email: str, file: dict[str, Any], ) -> Document | ConnectorFailure | None: @@ -107,10 +113,15 @@ def _convert_single_file( docs_service = lazy_eval( lambda: get_google_docs_service(creds, user_email=user_email) ) + scripts_service = lazy_eval( + lambda: get_google_scripts_service(creds, user_email=user_email) + ) return convert_drive_item_to_document( file=file, drive_service=user_drive_service, docs_service=docs_service, + scripts_service=scripts_service, + smart_chips_deployment_id=smart_chips_deployment_id, allow_images=allow_images, size_threshold=size_threshold, ) @@ -176,6 +187,7 @@ def __init__( my_drive_emails: str | None = None, shared_folder_urls: str | None = None, batch_size: int = INDEX_BATCH_SIZE, + smart_chip_deployment_id: str = "", # OLD PARAMETERS folder_paths: list[str] | None = None, include_shared: bool | None = None, @@ -248,6 +260,8 @@ def __init__( self._retrieved_ids: set[str] = set() self.allow_images = False + self.smart_chip_deployment_id = smart_chip_deployment_id + self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD def set_allow_images(self, value: bool) -> None: @@ -295,8 +309,108 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None source=DocumentSource.GOOGLE_DRIVE, ) + if USE_SMART_CHIP_SCOPES: + self.upsert_smart_chip_app_script() + return new_creds_dict + @staticmethod + def _get_latest_deployment( + scripts_service: GoogleScriptsService, script_id: str + ) -> dict[str, Any]: + deployments = ( + scripts_service.projects() + .deployments() + .list( + scriptId=script_id, + ) + .execute() + ) + all_deployments = deployments.get("deployments", []) + while "nextPageToken" in deployments: + deployments = ( + scripts_service.projects() + .deployments() + .list( + scriptId=script_id, + pageToken=deployments["nextPageToken"], + ) + .execute() + ) + all_deployments.extend(deployments.get("deployments", [])) + + if len(all_deployments) == 0: + raise RuntimeError(f"No deployments found for script {script_id}") + return max( + all_deployments, + key=lambda x: datetime.fromisoformat(x["updateTime"]).timestamp(), + ) + + def upsert_smart_chip_app_script(self) -> None: + assert self._creds is not None, "creds not set" + + # If a deployment id is provided, we don't need to create a new script. + # The deployment id can be retrieved by going under + # Deploy -> Test deployments -> Head Deployment ID in the UI (script.google.com) + if self.smart_chip_deployment_id: + return + + # Step 1: Check if the script already exists by searching the admin drive. + drive_service = get_drive_service( + self._creds, user_email=self.primary_admin_email + ) + q = f"mimeType = 'application/vnd.google-apps.script' and name = '{SMART_CHIP_SCRIPT_FILE_NAME}' and trashed = false" + script_search = ( + drive_service.files() + .list( + corpora="user", + fields="files(mimeType, id, name)", + q=q, + ) + .execute() + ) + script_id = (script_search.get("files") or [{}])[0].get("id") + scripts_service = get_google_scripts_service( + self._creds, user_email=self.primary_admin_email + ) + if not script_id: + # Step 2: Create the script if nonexistent + # (Takes about ~10 seconds) + req = scripts_service.projects().create( + body={"title": SMART_CHIP_SCRIPT_FILE_NAME} + ) + response = req.execute() + + if "scriptId" not in response: + raise RuntimeError( + f"Failed to create Smart Chip App Script: {response}" + ) + + script_id = response["scriptId"] + scripts_files = create_scripts_file_objects() + # Step 3: Update (upload) the script content + response = ( + scripts_service.projects() + .updateContent(scriptId=script_id, body={"files": scripts_files}) + .execute() + ) + + if "scriptId" not in response: + raise RuntimeError( + f"Failed to update Smart Chip App Script: {response}" + ) + + script_id = response["scriptId"] + + # Step 4: Get the deployment id + self.smart_chip_deployment_id = self._get_latest_deployment( + scripts_service, script_id + )["deploymentId"] + + # TODO: upsert new version if out of date. We don't expect to do this often. + # One way would be to check whether the script files have changed (either via git + # or actually pulling the current content and comparing). + def _update_traversed_parent_ids(self, folder_id: str) -> None: self._retrieved_ids.add(folder_id) @@ -951,6 +1065,7 @@ def _extract_docs_from_google_drive( self.creds, self.allow_images, self.size_threshold, + self.smart_chip_deployment_id, ) # Fetch files in batches batches_complete = 0 diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index ce3800e59e..cd4ff05914 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -14,6 +14,7 @@ from onyx.connectors.google_drive.section_extraction import get_document_sections from onyx.connectors.google_utils.resources import GoogleDocsService from onyx.connectors.google_utils.resources import GoogleDriveService +from onyx.connectors.google_utils.resources import GoogleScriptsService from onyx.connectors.models import ConnectorFailure from onyx.connectors.models import Document from onyx.connectors.models import DocumentFailure @@ -224,6 +225,8 @@ def convert_drive_item_to_document( file: GoogleDriveFileType, drive_service: Callable[[], GoogleDriveService], docs_service: Callable[[], GoogleDocsService], + scripts_service: Callable[[], GoogleScriptsService], + smart_chips_deployment_id: str, allow_images: bool, size_threshold: int, ) -> Document | ConnectorFailure | None: @@ -244,7 +247,10 @@ def convert_drive_item_to_document( try: # get_document_sections is the advanced approach for Google Docs doc_sections = get_document_sections( - docs_service=docs_service(), doc_id=file.get("id", "") + docs_service=docs_service(), + scripts_service=scripts_service(), + smart_chips_deployment_id=smart_chips_deployment_id, + doc_id=file.get("id", ""), ) if doc_sections: sections = cast(list[TextSection | ImageSection], doc_sections) diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py index e571302fbc..2d78e31276 100644 --- a/backend/onyx/connectors/google_drive/section_extraction.py +++ b/backend/onyx/connectors/google_drive/section_extraction.py @@ -1,11 +1,18 @@ +from collections.abc import Callable from typing import Any from pydantic import BaseModel +from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES from onyx.connectors.google_utils.resources import GoogleDocsService +from onyx.connectors.google_utils.resources import GoogleScriptsService from onyx.connectors.models import TextSection +DRIVE_CHIP_CHAR = "\ue907" +UNKNOWN_SMART_CHIP_STR = "" + + class CurrentHeading(BaseModel): id: str | None text: str @@ -24,7 +31,9 @@ def _extract_id_from_heading(paragraph: dict[str, Any]) -> str: return paragraph["paragraphStyle"]["headingId"] -def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str: +def _extract_text_from_paragraph( + paragraph: dict[str, Any], extract_chip: Callable[[int], str | None] +) -> tuple[str, int]: """Extracts the text content from a paragraph element""" text_elements = [] for element in paragraph.get("elements", []): @@ -33,39 +42,135 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str: # Handle links if "textStyle" in element and "link" in element["textStyle"]: - text_elements.append(f"[{element['textStyle']['link'].get('url', '')}]") - return "".join(text_elements) + text_elements.append(f"({element['textStyle']['link'].get('url', '')})") + + if "person" in element: + name = element["person"].get("personProperties", {}).get("name", "") + email = element["person"].get("personProperties", {}).get("email", "") + person_str = " dict[str, str]: + """Extracts smart chips from a Google Doc. Returns a dictionary where + the keys are the smart chip location keys and the values are the smart chip text. + + This calls a Google Apps Script function, because most smart chips are not currently + available through the API https://issuetracker.google.com/issues/225584757 + + Each location key is formatted as "tabNum_paragraphNum_nonTextIndexNum". + nonTextIndexNum refers to the index at which the value was found while traversing + the paragraph or table cell from left to right, top to bottom. + + There are many non-text elements that are currently not supported by Apps Script, (see + https://developers.google.com/apps-script/reference/document/element-type ), so some + non-text elements won't have an associated text value. + """ + + # NOTE: the documentation is incorrect; the script id you must specify is + # actually the deployment id (what comes up when you go to Deploy-> Test Deployments) + http_request = scripts_service.scripts().run( + scriptId=deployment_id, + body={ + "function": "docToChips", + "parameters": [document_id], + # "devMode": True + }, + ) + doc = http_request.execute() + return doc.get("response", {}).get("result", {}) -def _extract_text_from_table(table: dict[str, Any]) -> str: - """Extracts the text content from a table element""" +def _extract_text_from_table( + table: dict[str, Any], extract_chip: Callable[[int], str | None] +) -> str: + """ + Extracts the text content from a table element. + Smart chip extraction will be wrong for nested tables. + """ row_strs = [] + seen_non_text = 0 + + def table_extract_chip(non_text_index: int) -> str | None: + return extract_chip(non_text_index + seen_non_text) + for row in table.get("tableRows", []): cells = row.get("tableCells", []) - cell_strs = [ - _extract_text_from_paragraph(cell.get("content", {})) for cell in cells - ] + cell_strs = [] + for cell in cells: + child_elements = cell.get("content", {}) + cell_str = [] + for child_elem in child_elements: + if "paragraph" not in child_elem: + continue + text, num_non_text_elements = _extract_text_from_paragraph( + child_elem["paragraph"], table_extract_chip + ) + cell_str.append(text) + seen_non_text += num_non_text_elements + + cell_strs.append("".join(cell_str)) row_strs.append(", ".join(cell_strs)) return "\n".join(row_strs) def get_document_sections( docs_service: GoogleDocsService, + scripts_service: GoogleScriptsService, + smart_chips_deployment_id: str, doc_id: str, ) -> list[TextSection]: """Extracts sections from a Google Doc, including their headings and content""" # Fetch the document structure - doc = ( - docs_service.documents() - .get(documentId=doc_id, includeTabsContent=True) - .execute() - ) + http_request = docs_service.documents().get(documentId=doc_id) + + # Google has poor support for tabs in the docs api, see + # https://cloud.google.com/python/docs/reference/cloudtasks/ + # latest/google.cloud.tasks_v2.types.HttpRequest + # https://developers.google.com/workspace/docs/api/how-tos/tabs + # https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get + # this is a hack to use the param mentioned in the rest api docs + # TODO: check if it can be specified i.e. in documents() + http_request.uri += "&includeTabsContent=true" + doc = http_request.execute() + + smart_chips = {} + if USE_SMART_CHIP_SCOPES: + # Get the smart chips + smart_chips = _extract_smart_chips_from_document( + doc_id, scripts_service, smart_chips_deployment_id + ) # Get the content tabs = doc.get("tabs", {}) sections: list[TextSection] = [] - for tab in tabs: - sections.extend(get_tab_sections(tab, doc_id)) + for tab_num, tab in enumerate(tabs): + sections.extend(get_tab_sections(tab, doc_id, tab_num, smart_chips)) return sections @@ -106,7 +211,9 @@ def _add_finished_section( ) -def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]: +def get_tab_sections( + tab: dict[str, Any], doc_id: str, tab_num: int, smart_chips: dict[str, str] +) -> list[TextSection]: tab_id = tab["tabProperties"]["tabId"] content = tab.get("documentTab", {}).get("body", {}).get("content", []) @@ -114,13 +221,17 @@ def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]: current_section: list[str] = [] current_heading = CurrentHeading(id=None, text="") - for element in content: + for element_num, element in enumerate(content): + + def extract_chip(non_text_index: int) -> str | None: + return smart_chips.get(f"{tab_num}_{element_num-1}_{non_text_index}") + if "paragraph" in element: paragraph = element["paragraph"] # If this is not a heading, add content to current section if not _is_heading(paragraph): - text = _extract_text_from_paragraph(paragraph) + text, _ = _extract_text_from_paragraph(paragraph, extract_chip) if text.strip(): current_section.append(text) continue @@ -133,13 +244,13 @@ def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]: # Start new heading heading_id = _extract_id_from_heading(paragraph) - heading_text = _extract_text_from_paragraph(paragraph) + heading_text, _ = _extract_text_from_paragraph(paragraph, extract_chip) current_heading = CurrentHeading( id=heading_id, text=heading_text, ) elif "table" in element: - text = _extract_text_from_table(element["table"]) + text = _extract_text_from_table(element["table"], extract_chip) if text.strip(): current_section.append(text) diff --git a/backend/onyx/connectors/google_utils/google_utils.py b/backend/onyx/connectors/google_utils/google_utils.py index 4ad6cbe7af..51ec2181a3 100644 --- a/backend/onyx/connectors/google_utils/google_utils.py +++ b/backend/onyx/connectors/google_utils/google_utils.py @@ -1,3 +1,4 @@ +import json import re import time from collections.abc import Callable @@ -141,3 +142,50 @@ def execute_paginated_retrieval( yield item else: yield results + + +# https://developers.google.com/apps-script/api/reference/rest/v1/File#FileType +class AppsScriptFileType(str, Enum): + UNSPECIFIED = "ENUM_TYPE_UNSPECIFIED" + SERVER_JS = "SERVER_JS" + HTML = "HTML" + JSON = "JSON" + + +SMART_CHIP_RETRIEVAL_FUNCTIONS = [ + ("docToChips", ["document_id"]), + ("getKey", ["tabInd", "paragraphInd", "nonTextInd"]), + ("parseParagraph", ["paragraph", "callback"]), + ("parseTable", ["table", "callback"]), +] + +SMART_CHIP_SCRIPT_FILE_NAME = "Smart_Chip_Extractor" + + +# https://developers.google.com/apps-script/api/reference/rest/v1/projects/updateContent +def create_scripts_file_objects() -> list[GoogleDriveFileType]: + with open("onyx/connectors/google_drive/smart_chip_retrieval.gs", "r") as f: + script_source = f.read() + with open("onyx/connectors/google_drive/appsscript.json", "r") as f: + appsscript_source = json.loads(f.read()) + return [ + { + "name": "appsscript", + "type": AppsScriptFileType.JSON.value, + "source": json.dumps(appsscript_source), + }, + { + "name": SMART_CHIP_SCRIPT_FILE_NAME, + "type": AppsScriptFileType.SERVER_JS.value, + "source": script_source, + "functionSet": { + "values": [ + { + "name": name, + "parameters": params, + } + for name, params in SMART_CHIP_RETRIEVAL_FUNCTIONS + ], + }, + }, + ] diff --git a/backend/onyx/connectors/google_utils/resources.py b/backend/onyx/connectors/google_utils/resources.py index 48bd981c2b..a12f30cf67 100644 --- a/backend/onyx/connectors/google_utils/resources.py +++ b/backend/onyx/connectors/google_utils/resources.py @@ -12,6 +12,10 @@ class GoogleDocsService(Resource): pass +class GoogleScriptsService(Resource): + pass + + class AdminService(Resource): pass @@ -62,3 +66,10 @@ def get_gmail_service( user_email: str | None = None, ) -> GmailService: return _get_google_service("gmail", "v1", creds, user_email) + + +def get_google_scripts_service( + creds: ServiceAccountCredentials | OAuthCredentials, + user_email: str | None = None, +) -> GoogleScriptsService: + return _get_google_service("script", "v1", creds, user_email) diff --git a/backend/onyx/connectors/google_utils/shared_constants.py b/backend/onyx/connectors/google_utils/shared_constants.py index bd5ebb11ec..483ea6dd35 100644 --- a/backend/onyx/connectors/google_utils/shared_constants.py +++ b/backend/onyx/connectors/google_utils/shared_constants.py @@ -1,5 +1,6 @@ from enum import Enum as PyEnum +from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES from onyx.configs.constants import DocumentSource # NOTE: do not need https://www.googleapis.com/auth/documents.readonly @@ -18,6 +19,19 @@ ], } +# TODO: add this to the docs +GOOGLE_SMART_CHIP_SCOPES = [ + "https://www.googleapis.com/auth/script.external_request", + "https://www.googleapis.com/auth/drive.scripts", + "https://www.googleapis.com/auth/script.scriptapp", + "https://www.googleapis.com/auth/script.deployments", + "https://www.googleapis.com/auth/script.projects", + "https://www.googleapis.com/auth/documents", +] + +if USE_SMART_CHIP_SCOPES: + GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE] += GOOGLE_SMART_CHIP_SCOPES + # This is the Oauth token DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens" # This is the service account key From d31e1407f2843c4381a14d510a76a4f02ca1475e Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Fri, 4 Apr 2025 16:53:30 -0700 Subject: [PATCH 3/7] rebased and added scripts --- .../connectors/google_drive/appsscript.json | 17 +++ .../google_drive/smart_chip_retrieval.gs | 132 ++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 backend/onyx/connectors/google_drive/appsscript.json create mode 100644 backend/onyx/connectors/google_drive/smart_chip_retrieval.gs diff --git a/backend/onyx/connectors/google_drive/appsscript.json b/backend/onyx/connectors/google_drive/appsscript.json new file mode 100644 index 0000000000..1a9e54269b --- /dev/null +++ b/backend/onyx/connectors/google_drive/appsscript.json @@ -0,0 +1,17 @@ +{ + "timeZone": "America/Los_Angeles", + "dependencies": { + "enabledAdvancedServices": [ + { + "userSymbol": "Docs", + "version": "v1", + "serviceId": "docs" + } + ] + }, + "exceptionLogging": "STACKDRIVER", + "runtimeVersion": "V8", + "executionApi": { + "access": "MYSELF" + } + } \ No newline at end of file diff --git a/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs b/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs new file mode 100644 index 0000000000..0419adc179 --- /dev/null +++ b/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs @@ -0,0 +1,132 @@ + +/** + * Retrieves the given Google doc by id and extracts dates, people, and rich links + * from it into a JSON keyed by tab, paragraph, and non-text-component index. + * + */ +function docToChips(document_id) { + const doc = DocumentApp.openById(document_id); + const tabs = doc.getTabs(); + const ret = new Map(); + tabs.map((tab, tabInd) => { + const docTab = tab.asDocumentTab(); + const body = docTab.getBody(); + for (let tabChildInd = 0; tabChildInd < body.getNumChildren(); tabChildInd++) { + var tabChild = body.getChild(tabChildInd); + var callback = ((nonTextInd, replaceText) => {ret[getKey(tabInd, tabChildInd, nonTextInd)] = replaceText;}); + switch (tabChild.getType()) { + case DocumentApp.ElementType.PARAGRAPH: + parseParagraph(tabChild.asParagraph(), callback); + console.log("paragraph", tabChild.asParagraph().getText()); + break; + case DocumentApp.ElementType.TABLE: + console.log("table"); + parseTable(tabChild.asTable(), callback); + break; + case DocumentApp.ElementType.LIST_ITEM: + var listItem = tabChild.asListItem(); + //console.log("list item:", listItem.getText(), listItem.getNumChildren()); + //console.log(listItem.getChild(0).asText().getText()); + parseParagraph(tabChild.asListItem(), callback); + break; + default: + console.log("found unknown tab body child of type: ", tabChild.getType().toString()); + } + } + }); + console.log(ret); + return ret; +} + +// uncomment and paste in a file id (and change the main function to "test") +// to test the docToChips function +// function test() { +// return docToChips("document id goes here"); +// } + +function getKey(tabInd, paragraphInd, nonTextInd) { + return tabInd + "_" + paragraphInd + "_" + nonTextInd; +} + +// also used for list items +function parseParagraph(paragraph, callback) { + var nonTextInd = 0; + for (let i = 0; i < paragraph.getNumChildren(); i++) { // + var child = paragraph.getChild(i); + switch (child.getType()) { + case DocumentApp.ElementType.DATE: + console.log(child.asDate().getDisplayText()); + callback(nonTextInd, child.asDate().getDisplayText()); + break; + case DocumentApp.ElementType.EQUATION: + var eqStr = child.getText(); + console.log("equation: ", eqStr); + callback(nonTextInd, eqStr); + break; + case DocumentApp.ElementType.PERSON: + var personStr = ""; + console.log(personStr); + //callback(nonTextInd, personStr); + nonTextInd--; // Advanced Docs API picks up people + break; + case DocumentApp.ElementType.RICH_LINK: + var richLink = child.asRichLink() + var linkStr = "" + console.log(linkStr); + // callback(nonTextInd, child.asRichLink().getUrl()); + nonTextInd--; // Advanced Docs API picks up rich links + break; + case DocumentApp.ElementType.TEXT: + console.log("text: "+ child.asText().getText()); + //console.log(child.asText().) + nonTextInd--; + break; + case DocumentApp.ElementType.UNSUPPORTED: + console.log("unsupported element type"); + break; + default: + console.log("found special element type:", child.getType().toString()); + } + nonTextInd++; + } +} + +function parseTable(table, callback) { + var lastSeenInCell = 0; + var allSeenElems = 0 + const tableCallback = ((nonTextInd, replaceText) => { + callback(allSeenElems + lastSeenInCell + nonTextInd, replaceText); + lastSeenInCell++; + }); + for (let rowInd = 0; rowInd < table.getNumChildren(); rowInd++) { + var row = table.getChild(rowInd); + if (row.getType() !== DocumentApp.ElementType.TABLE_ROW) { + console.log("table child type: ", row.getType().toString()); + continue; + } + + for (let colInd = 0; colInd < row.getNumChildren(); colInd++) { + var cell = row.getChild(colInd); + if (cell.getType() !== DocumentApp.ElementType.TABLE_CELL) { + console.log("row child type: ", cell.getType().toString()); + continue; + } + + for (let itemInd = 0; itemInd < cell.getNumChildren(); itemInd++) { + var item = cell.getChild(itemInd); + console.log(item.getType().toString()); + switch (item.getType()) { + case DocumentApp.ElementType.PARAGRAPH: + case DocumentApp.ElementType.LIST_ITEM: + parseParagraph(item, tableCallback); + break; + case DocumentApp.ElementType.TABLE: + parseTable(item, tableCallback); + break; + } + } + allSeenElems += lastSeenInCell; + lastSeenInCell = 0; + } + } +} \ No newline at end of file From 7a67e1196acf2aab3dedba6dfb5881d6f214f7f2 Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Fri, 4 Apr 2025 16:46:05 -0700 Subject: [PATCH 4/7] improved approach to extracting smart chips --- backend/onyx/configs/app_configs.py | 2 - .../onyx/connectors/google_drive/connector.py | 115 ------------------ .../connectors/google_drive/doc_conversion.py | 85 ++++++++++++- .../google_drive/section_extraction.py | 104 +++------------- .../connectors/google_utils/google_utils.py | 48 -------- .../onyx/connectors/google_utils/resources.py | 11 -- .../google_utils/shared_constants.py | 14 --- backend/onyx/connectors/models.py | 2 - 8 files changed, 96 insertions(+), 285 deletions(-) diff --git a/backend/onyx/configs/app_configs.py b/backend/onyx/configs/app_configs.py index f45808f937..a293df6248 100644 --- a/backend/onyx/configs/app_configs.py +++ b/backend/onyx/configs/app_configs.py @@ -163,8 +163,6 @@ MAX_DRIVE_WORKERS = int(os.environ.get("MAX_DRIVE_WORKERS", 4)) -USE_SMART_CHIP_SCOPES = os.environ.get("USE_SMART_CHIP_SCOPES", "").lower() == "true" - # Below are intended to match the env variables names used by the official postgres docker image # https://hub.docker.com/_/postgres POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres" diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py index 551a10a73d..f9764535b2 100644 --- a/backend/onyx/connectors/google_drive/connector.py +++ b/backend/onyx/connectors/google_drive/connector.py @@ -18,7 +18,6 @@ from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD from onyx.configs.app_configs import INDEX_BATCH_SIZE from onyx.configs.app_configs import MAX_DRIVE_WORKERS -from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES from onyx.configs.constants import DocumentSource from onyx.connectors.exceptions import ConnectorValidationError from onyx.connectors.exceptions import CredentialExpiredError @@ -40,16 +39,12 @@ from onyx.connectors.google_drive.models import RetrievedDriveFile from onyx.connectors.google_drive.models import StageCompletion from onyx.connectors.google_utils.google_auth import get_google_creds -from onyx.connectors.google_utils.google_utils import create_scripts_file_objects from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval from onyx.connectors.google_utils.google_utils import GoogleFields -from onyx.connectors.google_utils.google_utils import SMART_CHIP_SCRIPT_FILE_NAME from onyx.connectors.google_utils.resources import get_admin_service from onyx.connectors.google_utils.resources import get_drive_service from onyx.connectors.google_utils.resources import get_google_docs_service -from onyx.connectors.google_utils.resources import get_google_scripts_service from onyx.connectors.google_utils.resources import GoogleDriveService -from onyx.connectors.google_utils.resources import GoogleScriptsService from onyx.connectors.google_utils.shared_constants import ( DB_CREDENTIALS_PRIMARY_ADMIN_KEY, ) @@ -95,7 +90,6 @@ def _convert_single_file( creds: Any, allow_images: bool, size_threshold: int, - smart_chips_deployment_id: str, retriever_email: str, file: dict[str, Any], ) -> Document | ConnectorFailure | None: @@ -113,15 +107,10 @@ def _convert_single_file( docs_service = lazy_eval( lambda: get_google_docs_service(creds, user_email=user_email) ) - scripts_service = lazy_eval( - lambda: get_google_scripts_service(creds, user_email=user_email) - ) return convert_drive_item_to_document( file=file, drive_service=user_drive_service, docs_service=docs_service, - scripts_service=scripts_service, - smart_chips_deployment_id=smart_chips_deployment_id, allow_images=allow_images, size_threshold=size_threshold, ) @@ -187,7 +176,6 @@ def __init__( my_drive_emails: str | None = None, shared_folder_urls: str | None = None, batch_size: int = INDEX_BATCH_SIZE, - smart_chip_deployment_id: str = "", # OLD PARAMETERS folder_paths: list[str] | None = None, include_shared: bool | None = None, @@ -260,8 +248,6 @@ def __init__( self._retrieved_ids: set[str] = set() self.allow_images = False - self.smart_chip_deployment_id = smart_chip_deployment_id - self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD def set_allow_images(self, value: bool) -> None: @@ -309,108 +295,8 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None source=DocumentSource.GOOGLE_DRIVE, ) - if USE_SMART_CHIP_SCOPES: - self.upsert_smart_chip_app_script() - return new_creds_dict - @staticmethod - def _get_latest_deployment( - scripts_service: GoogleScriptsService, script_id: str - ) -> dict[str, Any]: - deployments = ( - scripts_service.projects() - .deployments() - .list( - scriptId=script_id, - ) - .execute() - ) - all_deployments = deployments.get("deployments", []) - while "nextPageToken" in deployments: - deployments = ( - scripts_service.projects() - .deployments() - .list( - scriptId=script_id, - pageToken=deployments["nextPageToken"], - ) - .execute() - ) - all_deployments.extend(deployments.get("deployments", [])) - - if len(all_deployments) == 0: - raise RuntimeError(f"No deployments found for script {script_id}") - return max( - all_deployments, - key=lambda x: datetime.fromisoformat(x["updateTime"]).timestamp(), - ) - - def upsert_smart_chip_app_script(self) -> None: - assert self._creds is not None, "creds not set" - - # If a deployment id is provided, we don't need to create a new script. - # The deployment id can be retrieved by going under - # Deploy -> Test deployments -> Head Deployment ID in the UI (script.google.com) - if self.smart_chip_deployment_id: - return - - # Step 1: Check if the script already exists by searching the admin drive. - drive_service = get_drive_service( - self._creds, user_email=self.primary_admin_email - ) - q = f"mimeType = 'application/vnd.google-apps.script' and name = '{SMART_CHIP_SCRIPT_FILE_NAME}' and trashed = false" - script_search = ( - drive_service.files() - .list( - corpora="user", - fields="files(mimeType, id, name)", - q=q, - ) - .execute() - ) - script_id = (script_search.get("files") or [{}])[0].get("id") - scripts_service = get_google_scripts_service( - self._creds, user_email=self.primary_admin_email - ) - if not script_id: - # Step 2: Create the script if nonexistent - # (Takes about ~10 seconds) - req = scripts_service.projects().create( - body={"title": SMART_CHIP_SCRIPT_FILE_NAME} - ) - response = req.execute() - - if "scriptId" not in response: - raise RuntimeError( - f"Failed to create Smart Chip App Script: {response}" - ) - - script_id = response["scriptId"] - scripts_files = create_scripts_file_objects() - # Step 3: Update (upload) the script content - response = ( - scripts_service.projects() - .updateContent(scriptId=script_id, body={"files": scripts_files}) - .execute() - ) - - if "scriptId" not in response: - raise RuntimeError( - f"Failed to update Smart Chip App Script: {response}" - ) - - script_id = response["scriptId"] - - # Step 4: Get the deployment id - self.smart_chip_deployment_id = self._get_latest_deployment( - scripts_service, script_id - )["deploymentId"] - - # TODO: upsert new version if out of date. We don't expect to do this often. - # One way would be to check whether the script files have changed (either via git - # or actually pulling the current content and comparing). - def _update_traversed_parent_ids(self, folder_id: str) -> None: self._retrieved_ids.add(folder_id) @@ -1065,7 +951,6 @@ def _extract_docs_from_google_drive( self.creds, self.allow_images, self.size_threshold, - self.smart_chip_deployment_id, ) # Fetch files in batches batches_complete = 0 diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index cd4ff05914..e80f6226d0 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -12,9 +12,9 @@ from onyx.connectors.google_drive.models import GDriveMimeType from onyx.connectors.google_drive.models import GoogleDriveFileType from onyx.connectors.google_drive.section_extraction import get_document_sections +from onyx.connectors.google_drive.section_extraction import HEADING_DELIMITER from onyx.connectors.google_utils.resources import GoogleDocsService from onyx.connectors.google_utils.resources import GoogleDriveService -from onyx.connectors.google_utils.resources import GoogleScriptsService from onyx.connectors.models import ConnectorFailure from onyx.connectors.models import Document from onyx.connectors.models import DocumentFailure @@ -36,6 +36,10 @@ logger = setup_logger() +# This is not a standard valid unicode char, it is used by the docs advanced API to +# represent smart chips (elements like dates and doc links). +SMART_CHIP_CHAR = "\ue907" + # Mapping of Google Drive mime types to export formats GOOGLE_MIME_TYPES_TO_EXPORT = { GDriveMimeType.DOC.value: "text/plain", @@ -221,12 +225,79 @@ def _download_and_extract_sections_basic( return [] +def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int: + start = haystack.find(needle, start) + while start >= 0 and n > 1: + start = haystack.find(needle, start + len(needle)) + n -= 1 + return start + + +def align_basic_advanced( + basic_sections: list[TextSection | ImageSection], adv_sections: list[TextSection] +) -> list[TextSection | ImageSection]: + """Align the basic sections with the advanced sections. + In particular, the basic sections contain all content of the file, + including smart chips like dates and doc links. The advanced sections + are separated by section headers and contain header-based links that + improve user experience when they click on the source in the UI. + + There are edge cases in text matching (i.e. the heading is a smart chip or + there is a smart chip in the doc with text containing the actual heading text) + that make the matching imperfect; this is hence done on a best-effort basis. + """ + if len(adv_sections) <= 1: + return basic_sections # no benefit from aligning + + basic_full_text = "".join( + [section.text for section in basic_sections if isinstance(section, TextSection)] + ) + new_sections: list[TextSection | ImageSection] = [] + heading_start = 0 + for adv_ind in range(1, len(adv_sections)): + heading = adv_sections[adv_ind].text.split(HEADING_DELIMITER)[0] + # retrieve the longest part of the heading that is not a smart chip + heading_key = max(heading.split(SMART_CHIP_CHAR), key=len).strip() + if heading_key == "": + logger.warning( + f"Cannot match heading: {heading}, its link will come from the following section" + ) + continue + heading_offset = heading.find(heading_key) + + # count occurrences of heading str in previous section + heading_count = adv_sections[adv_ind - 1].text.count(heading_key) + + prev_start = heading_start + heading_start = ( + _find_nth(basic_full_text, heading_key, heading_count, start=prev_start) + - heading_offset + ) + if heading_start < 0: + logger.warning( + f"Heading key {heading_key} from heading {heading} not found in basic text" + ) + heading_start = prev_start + continue + + new_sections.append( + TextSection( + link=adv_sections[adv_ind - 1].link, + text=basic_full_text[prev_start:heading_start], + ) + ) + + # handle last section + new_sections.append( + TextSection(link=adv_sections[-1].link, text=basic_full_text[heading_start:]) + ) + return new_sections + + def convert_drive_item_to_document( file: GoogleDriveFileType, drive_service: Callable[[], GoogleDriveService], docs_service: Callable[[], GoogleDocsService], - scripts_service: Callable[[], GoogleScriptsService], - smart_chips_deployment_id: str, allow_images: bool, size_threshold: int, ) -> Document | ConnectorFailure | None: @@ -248,12 +319,16 @@ def convert_drive_item_to_document( # get_document_sections is the advanced approach for Google Docs doc_sections = get_document_sections( docs_service=docs_service(), - scripts_service=scripts_service(), - smart_chips_deployment_id=smart_chips_deployment_id, doc_id=file.get("id", ""), ) if doc_sections: sections = cast(list[TextSection | ImageSection], doc_sections) + if any(SMART_CHIP_CHAR in section.text for section in doc_sections): + basic_sections = _download_and_extract_sections_basic( + file, drive_service(), allow_images + ) + sections = align_basic_advanced(basic_sections, doc_sections) + except Exception as e: logger.warning( f"Error in advanced parsing: {e}. Falling back to basic extraction." diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py index 2d78e31276..701d5d0c30 100644 --- a/backend/onyx/connectors/google_drive/section_extraction.py +++ b/backend/onyx/connectors/google_drive/section_extraction.py @@ -1,16 +1,11 @@ -from collections.abc import Callable from typing import Any from pydantic import BaseModel -from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES from onyx.connectors.google_utils.resources import GoogleDocsService -from onyx.connectors.google_utils.resources import GoogleScriptsService from onyx.connectors.models import TextSection - -DRIVE_CHIP_CHAR = "\ue907" -UNKNOWN_SMART_CHIP_STR = "" +HEADING_DELIMITER = "\n" class CurrentHeading(BaseModel): @@ -31,9 +26,7 @@ def _extract_id_from_heading(paragraph: dict[str, Any]) -> str: return paragraph["paragraphStyle"]["headingId"] -def _extract_text_from_paragraph( - paragraph: dict[str, Any], extract_chip: Callable[[int], str | None] -) -> tuple[str, int]: +def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str: """Extracts the text content from a paragraph element""" text_elements = [] for element in paragraph.get("elements", []): @@ -62,62 +55,14 @@ def _extract_text_from_paragraph( link_str = f"[{title}]({uri})" text_elements.append(link_str) - ret = "".join(text_elements) - - # add chip strings in place of each non-text - text_chunks = ret.split(DRIVE_CHIP_CHAR) - num_non_text_elements = len(text_chunks) - 1 - for i in range(num_non_text_elements): - text_chunks[i] += extract_chip(i) or UNKNOWN_SMART_CHIP_STR - return "".join(text_chunks), num_non_text_elements - - -def _extract_smart_chips_from_document( - document_id: str, - scripts_service: GoogleScriptsService, - deployment_id: str, -) -> dict[str, str]: - """Extracts smart chips from a Google Doc. Returns a dictionary where - the keys are the smart chip location keys and the values are the smart chip text. - - This calls a Google Apps Script function, because most smart chips are not currently - available through the API https://issuetracker.google.com/issues/225584757 + return "".join(text_elements) - Each location key is formatted as "tabNum_paragraphNum_nonTextIndexNum". - nonTextIndexNum refers to the index at which the value was found while traversing - the paragraph or table cell from left to right, top to bottom. - There are many non-text elements that are currently not supported by Apps Script, (see - https://developers.google.com/apps-script/reference/document/element-type ), so some - non-text elements won't have an associated text value. - """ - - # NOTE: the documentation is incorrect; the script id you must specify is - # actually the deployment id (what comes up when you go to Deploy-> Test Deployments) - http_request = scripts_service.scripts().run( - scriptId=deployment_id, - body={ - "function": "docToChips", - "parameters": [document_id], - # "devMode": True - }, - ) - doc = http_request.execute() - return doc.get("response", {}).get("result", {}) - - -def _extract_text_from_table( - table: dict[str, Any], extract_chip: Callable[[int], str | None] -) -> str: +def _extract_text_from_table(table: dict[str, Any]) -> str: """ Extracts the text content from a table element. - Smart chip extraction will be wrong for nested tables. """ row_strs = [] - seen_non_text = 0 - - def table_extract_chip(non_text_index: int) -> str | None: - return extract_chip(non_text_index + seen_non_text) for row in table.get("tableRows", []): cells = row.get("tableCells", []) @@ -128,12 +73,7 @@ def table_extract_chip(non_text_index: int) -> str | None: for child_elem in child_elements: if "paragraph" not in child_elem: continue - text, num_non_text_elements = _extract_text_from_paragraph( - child_elem["paragraph"], table_extract_chip - ) - cell_str.append(text) - seen_non_text += num_non_text_elements - + cell_str.append(_extract_text_from_paragraph(child_elem["paragraph"])) cell_strs.append("".join(cell_str)) row_strs.append(", ".join(cell_strs)) return "\n".join(row_strs) @@ -141,8 +81,6 @@ def table_extract_chip(non_text_index: int) -> str | None: def get_document_sections( docs_service: GoogleDocsService, - scripts_service: GoogleScriptsService, - smart_chips_deployment_id: str, doc_id: str, ) -> list[TextSection]: """Extracts sections from a Google Doc, including their headings and content""" @@ -159,18 +97,11 @@ def get_document_sections( http_request.uri += "&includeTabsContent=true" doc = http_request.execute() - smart_chips = {} - if USE_SMART_CHIP_SCOPES: - # Get the smart chips - smart_chips = _extract_smart_chips_from_document( - doc_id, scripts_service, smart_chips_deployment_id - ) - # Get the content tabs = doc.get("tabs", {}) sections: list[TextSection] = [] - for tab_num, tab in enumerate(tabs): - sections.extend(get_tab_sections(tab, doc_id, tab_num, smart_chips)) + for tab in tabs: + sections.extend(get_tab_sections(tab, doc_id)) return sections @@ -202,7 +133,10 @@ def _add_finished_section( if not (current_section or current_heading.text): return # If we were building a previous section, add it to sections list - section_text = f"{current_heading.text}\n" + "\n".join(current_section) + + # this is unlikely to ever matter, but helps if the doc contains weird headings + header_text = current_heading.text.replace(HEADING_DELIMITER, "") + section_text = f"{header_text}{HEADING_DELIMITER}" + "\n".join(current_section) sections.append( TextSection( text=section_text.strip(), @@ -211,9 +145,7 @@ def _add_finished_section( ) -def get_tab_sections( - tab: dict[str, Any], doc_id: str, tab_num: int, smart_chips: dict[str, str] -) -> list[TextSection]: +def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]: tab_id = tab["tabProperties"]["tabId"] content = tab.get("documentTab", {}).get("body", {}).get("content", []) @@ -221,17 +153,13 @@ def get_tab_sections( current_section: list[str] = [] current_heading = CurrentHeading(id=None, text="") - for element_num, element in enumerate(content): - - def extract_chip(non_text_index: int) -> str | None: - return smart_chips.get(f"{tab_num}_{element_num-1}_{non_text_index}") - + for element in content: if "paragraph" in element: paragraph = element["paragraph"] # If this is not a heading, add content to current section if not _is_heading(paragraph): - text, _ = _extract_text_from_paragraph(paragraph, extract_chip) + text = _extract_text_from_paragraph(paragraph) if text.strip(): current_section.append(text) continue @@ -244,13 +172,13 @@ def extract_chip(non_text_index: int) -> str | None: # Start new heading heading_id = _extract_id_from_heading(paragraph) - heading_text, _ = _extract_text_from_paragraph(paragraph, extract_chip) + heading_text = _extract_text_from_paragraph(paragraph) current_heading = CurrentHeading( id=heading_id, text=heading_text, ) elif "table" in element: - text = _extract_text_from_table(element["table"], extract_chip) + text = _extract_text_from_table(element["table"]) if text.strip(): current_section.append(text) diff --git a/backend/onyx/connectors/google_utils/google_utils.py b/backend/onyx/connectors/google_utils/google_utils.py index 51ec2181a3..4ad6cbe7af 100644 --- a/backend/onyx/connectors/google_utils/google_utils.py +++ b/backend/onyx/connectors/google_utils/google_utils.py @@ -1,4 +1,3 @@ -import json import re import time from collections.abc import Callable @@ -142,50 +141,3 @@ def execute_paginated_retrieval( yield item else: yield results - - -# https://developers.google.com/apps-script/api/reference/rest/v1/File#FileType -class AppsScriptFileType(str, Enum): - UNSPECIFIED = "ENUM_TYPE_UNSPECIFIED" - SERVER_JS = "SERVER_JS" - HTML = "HTML" - JSON = "JSON" - - -SMART_CHIP_RETRIEVAL_FUNCTIONS = [ - ("docToChips", ["document_id"]), - ("getKey", ["tabInd", "paragraphInd", "nonTextInd"]), - ("parseParagraph", ["paragraph", "callback"]), - ("parseTable", ["table", "callback"]), -] - -SMART_CHIP_SCRIPT_FILE_NAME = "Smart_Chip_Extractor" - - -# https://developers.google.com/apps-script/api/reference/rest/v1/projects/updateContent -def create_scripts_file_objects() -> list[GoogleDriveFileType]: - with open("onyx/connectors/google_drive/smart_chip_retrieval.gs", "r") as f: - script_source = f.read() - with open("onyx/connectors/google_drive/appsscript.json", "r") as f: - appsscript_source = json.loads(f.read()) - return [ - { - "name": "appsscript", - "type": AppsScriptFileType.JSON.value, - "source": json.dumps(appsscript_source), - }, - { - "name": SMART_CHIP_SCRIPT_FILE_NAME, - "type": AppsScriptFileType.SERVER_JS.value, - "source": script_source, - "functionSet": { - "values": [ - { - "name": name, - "parameters": params, - } - for name, params in SMART_CHIP_RETRIEVAL_FUNCTIONS - ], - }, - }, - ] diff --git a/backend/onyx/connectors/google_utils/resources.py b/backend/onyx/connectors/google_utils/resources.py index a12f30cf67..48bd981c2b 100644 --- a/backend/onyx/connectors/google_utils/resources.py +++ b/backend/onyx/connectors/google_utils/resources.py @@ -12,10 +12,6 @@ class GoogleDocsService(Resource): pass -class GoogleScriptsService(Resource): - pass - - class AdminService(Resource): pass @@ -66,10 +62,3 @@ def get_gmail_service( user_email: str | None = None, ) -> GmailService: return _get_google_service("gmail", "v1", creds, user_email) - - -def get_google_scripts_service( - creds: ServiceAccountCredentials | OAuthCredentials, - user_email: str | None = None, -) -> GoogleScriptsService: - return _get_google_service("script", "v1", creds, user_email) diff --git a/backend/onyx/connectors/google_utils/shared_constants.py b/backend/onyx/connectors/google_utils/shared_constants.py index 483ea6dd35..bd5ebb11ec 100644 --- a/backend/onyx/connectors/google_utils/shared_constants.py +++ b/backend/onyx/connectors/google_utils/shared_constants.py @@ -1,6 +1,5 @@ from enum import Enum as PyEnum -from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES from onyx.configs.constants import DocumentSource # NOTE: do not need https://www.googleapis.com/auth/documents.readonly @@ -19,19 +18,6 @@ ], } -# TODO: add this to the docs -GOOGLE_SMART_CHIP_SCOPES = [ - "https://www.googleapis.com/auth/script.external_request", - "https://www.googleapis.com/auth/drive.scripts", - "https://www.googleapis.com/auth/script.scriptapp", - "https://www.googleapis.com/auth/script.deployments", - "https://www.googleapis.com/auth/script.projects", - "https://www.googleapis.com/auth/documents", -] - -if USE_SMART_CHIP_SCOPES: - GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE] += GOOGLE_SMART_CHIP_SCOPES - # This is the Oauth token DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens" # This is the service account key diff --git a/backend/onyx/connectors/models.py b/backend/onyx/connectors/models.py index 4dfadb1d9c..4fe586897b 100644 --- a/backend/onyx/connectors/models.py +++ b/backend/onyx/connectors/models.py @@ -39,7 +39,6 @@ class TextSection(Section): """Section containing text content""" text: str - link: str | None = None def __sizeof__(self) -> int: return sys.getsizeof(self.text) + sys.getsizeof(self.link) @@ -49,7 +48,6 @@ class ImageSection(Section): """Section containing an image reference""" image_file_name: str - link: str | None = None def __sizeof__(self) -> int: return sys.getsizeof(self.image_file_name) + sys.getsizeof(self.link) From a226361d8b4bf82779d2ea686709d95588b3a086 Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Fri, 4 Apr 2025 17:52:52 -0700 Subject: [PATCH 5/7] remove files from previous branch --- .../connectors/google_drive/appsscript.json | 17 --- .../google_drive/smart_chip_retrieval.gs | 132 ------------------ 2 files changed, 149 deletions(-) delete mode 100644 backend/onyx/connectors/google_drive/appsscript.json delete mode 100644 backend/onyx/connectors/google_drive/smart_chip_retrieval.gs diff --git a/backend/onyx/connectors/google_drive/appsscript.json b/backend/onyx/connectors/google_drive/appsscript.json deleted file mode 100644 index 1a9e54269b..0000000000 --- a/backend/onyx/connectors/google_drive/appsscript.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "timeZone": "America/Los_Angeles", - "dependencies": { - "enabledAdvancedServices": [ - { - "userSymbol": "Docs", - "version": "v1", - "serviceId": "docs" - } - ] - }, - "exceptionLogging": "STACKDRIVER", - "runtimeVersion": "V8", - "executionApi": { - "access": "MYSELF" - } - } \ No newline at end of file diff --git a/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs b/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs deleted file mode 100644 index 0419adc179..0000000000 --- a/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs +++ /dev/null @@ -1,132 +0,0 @@ - -/** - * Retrieves the given Google doc by id and extracts dates, people, and rich links - * from it into a JSON keyed by tab, paragraph, and non-text-component index. - * - */ -function docToChips(document_id) { - const doc = DocumentApp.openById(document_id); - const tabs = doc.getTabs(); - const ret = new Map(); - tabs.map((tab, tabInd) => { - const docTab = tab.asDocumentTab(); - const body = docTab.getBody(); - for (let tabChildInd = 0; tabChildInd < body.getNumChildren(); tabChildInd++) { - var tabChild = body.getChild(tabChildInd); - var callback = ((nonTextInd, replaceText) => {ret[getKey(tabInd, tabChildInd, nonTextInd)] = replaceText;}); - switch (tabChild.getType()) { - case DocumentApp.ElementType.PARAGRAPH: - parseParagraph(tabChild.asParagraph(), callback); - console.log("paragraph", tabChild.asParagraph().getText()); - break; - case DocumentApp.ElementType.TABLE: - console.log("table"); - parseTable(tabChild.asTable(), callback); - break; - case DocumentApp.ElementType.LIST_ITEM: - var listItem = tabChild.asListItem(); - //console.log("list item:", listItem.getText(), listItem.getNumChildren()); - //console.log(listItem.getChild(0).asText().getText()); - parseParagraph(tabChild.asListItem(), callback); - break; - default: - console.log("found unknown tab body child of type: ", tabChild.getType().toString()); - } - } - }); - console.log(ret); - return ret; -} - -// uncomment and paste in a file id (and change the main function to "test") -// to test the docToChips function -// function test() { -// return docToChips("document id goes here"); -// } - -function getKey(tabInd, paragraphInd, nonTextInd) { - return tabInd + "_" + paragraphInd + "_" + nonTextInd; -} - -// also used for list items -function parseParagraph(paragraph, callback) { - var nonTextInd = 0; - for (let i = 0; i < paragraph.getNumChildren(); i++) { // - var child = paragraph.getChild(i); - switch (child.getType()) { - case DocumentApp.ElementType.DATE: - console.log(child.asDate().getDisplayText()); - callback(nonTextInd, child.asDate().getDisplayText()); - break; - case DocumentApp.ElementType.EQUATION: - var eqStr = child.getText(); - console.log("equation: ", eqStr); - callback(nonTextInd, eqStr); - break; - case DocumentApp.ElementType.PERSON: - var personStr = ""; - console.log(personStr); - //callback(nonTextInd, personStr); - nonTextInd--; // Advanced Docs API picks up people - break; - case DocumentApp.ElementType.RICH_LINK: - var richLink = child.asRichLink() - var linkStr = "" - console.log(linkStr); - // callback(nonTextInd, child.asRichLink().getUrl()); - nonTextInd--; // Advanced Docs API picks up rich links - break; - case DocumentApp.ElementType.TEXT: - console.log("text: "+ child.asText().getText()); - //console.log(child.asText().) - nonTextInd--; - break; - case DocumentApp.ElementType.UNSUPPORTED: - console.log("unsupported element type"); - break; - default: - console.log("found special element type:", child.getType().toString()); - } - nonTextInd++; - } -} - -function parseTable(table, callback) { - var lastSeenInCell = 0; - var allSeenElems = 0 - const tableCallback = ((nonTextInd, replaceText) => { - callback(allSeenElems + lastSeenInCell + nonTextInd, replaceText); - lastSeenInCell++; - }); - for (let rowInd = 0; rowInd < table.getNumChildren(); rowInd++) { - var row = table.getChild(rowInd); - if (row.getType() !== DocumentApp.ElementType.TABLE_ROW) { - console.log("table child type: ", row.getType().toString()); - continue; - } - - for (let colInd = 0; colInd < row.getNumChildren(); colInd++) { - var cell = row.getChild(colInd); - if (cell.getType() !== DocumentApp.ElementType.TABLE_CELL) { - console.log("row child type: ", cell.getType().toString()); - continue; - } - - for (let itemInd = 0; itemInd < cell.getNumChildren(); itemInd++) { - var item = cell.getChild(itemInd); - console.log(item.getType().toString()); - switch (item.getType()) { - case DocumentApp.ElementType.PARAGRAPH: - case DocumentApp.ElementType.LIST_ITEM: - parseParagraph(item, tableCallback); - break; - case DocumentApp.ElementType.TABLE: - parseTable(item, tableCallback); - break; - } - } - allSeenElems += lastSeenInCell; - lastSeenInCell = 0; - } - } -} \ No newline at end of file From 47a1f8685e2885560fa26d5a844cded1c6e7ead2 Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Fri, 4 Apr 2025 17:55:12 -0700 Subject: [PATCH 6/7] fix connector tests --- .../daily/connectors/google_drive/consts_and_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/tests/daily/connectors/google_drive/consts_and_utils.py b/backend/tests/daily/connectors/google_drive/consts_and_utils.py index f08a587870..678d958da0 100644 --- a/backend/tests/daily/connectors/google_drive/consts_and_utils.py +++ b/backend/tests/daily/connectors/google_drive/consts_and_utils.py @@ -123,15 +123,15 @@ SPECIAL_FILE_ID_TO_CONTENT_MAP: dict[int, str] = { 61: ( - "Title\n\n" + "Title\n" "This is a Google Doc with sections - " - "Section 1\n\n" + "Section 1\n" "Section 1 content - " - "Sub-Section 1-1\n\n" + "Sub-Section 1-1\n" "Sub-Section 1-1 content - " - "Sub-Section 1-2\n\n" + "Sub-Section 1-2\n" "Sub-Section 1-2 content - " - "Section 2\n\n" + "Section 2\n" "Section 2 content" ), } From 2356674d0990088b63e49d968d7ff3250fa98577 Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Fri, 4 Apr 2025 20:25:05 -0700 Subject: [PATCH 7/7] fix test --- .../daily/connectors/google_drive/test_sections.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/tests/daily/connectors/google_drive/test_sections.py b/backend/tests/daily/connectors/google_drive/test_sections.py index 6dd22a0e52..25237d701f 100644 --- a/backend/tests/daily/connectors/google_drive/test_sections.py +++ b/backend/tests/daily/connectors/google_drive/test_sections.py @@ -46,28 +46,28 @@ def test_google_drive_sections( assert len(doc.sections) == 5 header_section = doc.sections[0] - assert header_section.text == "Title\n\nThis is a Google Doc with sections" + assert header_section.text == "Title\nThis is a Google Doc with sections" assert header_section.link is not None assert header_section.link.endswith( "?tab=t.0#heading=h.hfjc17k6qwzt" ) or header_section.link.endswith("?tab=t.0#heading=h.hfjc17k6qwzt") section_1 = doc.sections[1] - assert section_1.text == "Section 1\n\nSection 1 content" + assert section_1.text == "Section 1\nSection 1 content" assert section_1.link is not None assert section_1.link.endswith("?tab=t.0#heading=h.8slfx752a3g5") section_2 = doc.sections[2] - assert section_2.text == "Sub-Section 1-1\n\nSub-Section 1-1 content" + assert section_2.text == "Sub-Section 1-1\nSub-Section 1-1 content" assert section_2.link is not None assert section_2.link.endswith("?tab=t.0#heading=h.4kj3ayade1bp") section_3 = doc.sections[3] - assert section_3.text == "Sub-Section 1-2\n\nSub-Section 1-2 content" + assert section_3.text == "Sub-Section 1-2\nSub-Section 1-2 content" assert section_3.link is not None assert section_3.link.endswith("?tab=t.0#heading=h.pm6wrpzgk69l") section_4 = doc.sections[4] - assert section_4.text == "Section 2\n\nSection 2 content" + assert section_4.text == "Section 2\nSection 2 content" assert section_4.link is not None assert section_4.link.endswith("?tab=t.0#heading=h.2m0s9youe2k9")