From ebfaecd366020367196af09baf94da01ab63dcaa Mon Sep 17 00:00:00 2001
From: Evan Lohn <evan@danswer.ai>
Date: Wed, 2 Apr 2025 20:55:20 -0700
Subject: [PATCH 1/7] WIP

---
 backend/onyx/chat/process_message.py          |  12 +-
 .../google_drive/section_extraction.py        | 150 +++++++++++-------
 2 files changed, 101 insertions(+), 61 deletions(-)

diff --git a/backend/onyx/chat/process_message.py b/backend/onyx/chat/process_message.py
index eea541aca7..cea03aeba0 100644
--- a/backend/onyx/chat/process_message.py
+++ b/backend/onyx/chat/process_message.py
@@ -1089,14 +1089,10 @@ def stream_chat_message_objects(
                         selected_search_docs=selected_db_search_docs,
                         # Deduping happens at the last step to avoid harming quality by dropping content early on
                         # Skip deduping completely for ordering-only mode to save time
-                        dedupe_docs=(
-                            False
-                            if search_for_ordering_only
-                            else (
-                                retrieval_options.dedupe_docs
-                                if retrieval_options
-                                else False
-                            )
+                        dedupe_docs=bool(
+                            not search_for_ordering_only
+                            and retrieval_options
+                            and retrieval_options.dedupe_docs
                         ),
                         user_files=user_file_files if search_for_ordering_only else [],
                         loaded_user_files=user_files
diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py
index 0c63eba5c6..e571302fbc 100644
--- a/backend/onyx/connectors/google_drive/section_extraction.py
+++ b/backend/onyx/connectors/google_drive/section_extraction.py
@@ -7,17 +7,16 @@
 
 
 class CurrentHeading(BaseModel):
-    id: str
+    id: str | None
     text: str
 
 
-def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str:
+def _build_gdoc_section_link(doc_id: str, tab_id: str, heading_id: str | None) -> str:
     """Builds a Google Doc link that jumps to a specific heading"""
     # NOTE: doesn't support docs with multiple tabs atm, if we need that ask
     # @Chris
-    return (
-        f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}"
-    )
+    heading_str = f"#heading={heading_id}" if heading_id else ""
+    return f"https://docs.google.com/document/d/{doc_id}/edit?tab={tab_id}{heading_str}"
 
 
 def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
@@ -31,75 +30,120 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
     for element in paragraph.get("elements", []):
         if "textRun" in element:
             text_elements.append(element["textRun"].get("content", ""))
+
+        # Handle links
+        if "textStyle" in element and "link" in element["textStyle"]:
+            text_elements.append(f"[{element['textStyle']['link'].get('url', '')}]")
     return "".join(text_elements)
 
 
+def _extract_text_from_table(table: dict[str, Any]) -> str:
+    """Extracts the text content from a table element"""
+    row_strs = []
+    for row in table.get("tableRows", []):
+        cells = row.get("tableCells", [])
+        cell_strs = [
+            _extract_text_from_paragraph(cell.get("content", {})) for cell in cells
+        ]
+        row_strs.append(", ".join(cell_strs))
+    return "\n".join(row_strs)
+
+
 def get_document_sections(
     docs_service: GoogleDocsService,
     doc_id: str,
 ) -> list[TextSection]:
     """Extracts sections from a Google Doc, including their headings and content"""
     # Fetch the document structure
-    doc = docs_service.documents().get(documentId=doc_id).execute()
+    doc = (
+        docs_service.documents()
+        .get(documentId=doc_id, includeTabsContent=True)
+        .execute()
+    )
 
     # Get the content
-    content = doc.get("body", {}).get("content", [])
+    tabs = doc.get("tabs", {})
+    sections: list[TextSection] = []
+    for tab in tabs:
+        sections.extend(get_tab_sections(tab, doc_id))
+    return sections
+
+
+def _is_heading(paragraph: dict[str, Any]) -> bool:
+    """Checks if a paragraph (a block of text in a drive document) is a heading"""
+    if not (
+        "paragraphStyle" in paragraph
+        and "namedStyleType" in paragraph["paragraphStyle"]
+    ):
+        return False
+
+    style = paragraph["paragraphStyle"]["namedStyleType"]
+    is_heading = style.startswith("HEADING_")
+    is_title = style.startswith("TITLE")
+    return is_heading or is_title
+
+
+def _add_finished_section(
+    sections: list[TextSection],
+    doc_id: str,
+    tab_id: str,
+    current_heading: CurrentHeading,
+    current_section: list[str],
+) -> None:
+    """Appends the finished section to `sections` in place, unless both the
+    heading text and the accumulated content are empty. Nothing is returned;
+    the section link targets the heading when it has an id, else just the tab.
+    """
+    if not (current_section or current_heading.text):
+        return
+    # Heading text comes first, followed by the accumulated content lines
+    section_text = f"{current_heading.text}\n" + "\n".join(current_section)
+    sections.append(
+        TextSection(
+            text=section_text.strip(),
+            link=_build_gdoc_section_link(doc_id, tab_id, current_heading.id),
+        )
+    )
+
+
+def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
+    tab_id = tab["tabProperties"]["tabId"]
+    content = tab.get("documentTab", {}).get("body", {}).get("content", [])
 
     sections: list[TextSection] = []
     current_section: list[str] = []
-    current_heading: CurrentHeading | None = None
+    current_heading = CurrentHeading(id=None, text="")
 
     for element in content:
-        if "paragraph" not in element:
-            continue
-
-        paragraph = element["paragraph"]
-
-        # Check if this is a heading
-        if (
-            "paragraphStyle" in paragraph
-            and "namedStyleType" in paragraph["paragraphStyle"]
-        ):
-            style = paragraph["paragraphStyle"]["namedStyleType"]
-            is_heading = style.startswith("HEADING_")
-            is_title = style.startswith("TITLE")
-
-            if is_heading or is_title:
-                # If we were building a previous section, add it to sections list
-                if current_heading is not None and current_section:
-                    heading_text = current_heading.text
-                    section_text = f"{heading_text}\n" + "\n".join(current_section)
-                    sections.append(
-                        TextSection(
-                            text=section_text.strip(),
-                            link=_build_gdoc_section_link(doc_id, current_heading.id),
-                        )
-                    )
-                    current_section = []
-
-                # Start new heading
-                heading_id = _extract_id_from_heading(paragraph)
-                heading_text = _extract_text_from_paragraph(paragraph)
-                current_heading = CurrentHeading(
-                    id=heading_id,
-                    text=heading_text,
-                )
+        if "paragraph" in element:
+            paragraph = element["paragraph"]
+
+            # If this is not a heading, add content to current section
+            if not _is_heading(paragraph):
+                text = _extract_text_from_paragraph(paragraph)
+                if text.strip():
+                    current_section.append(text)
                 continue
 
-        # Add content to current section
-        if current_heading is not None:
-            text = _extract_text_from_paragraph(paragraph)
+            _add_finished_section(
+                sections, doc_id, tab_id, current_heading, current_section
+            )
+
+            current_section = []
+
+            # Start new heading
+            heading_id = _extract_id_from_heading(paragraph)
+            heading_text = _extract_text_from_paragraph(paragraph)
+            current_heading = CurrentHeading(
+                id=heading_id,
+                text=heading_text,
+            )
+        elif "table" in element:
+            text = _extract_text_from_table(element["table"])
             if text.strip():
                 current_section.append(text)
 
     # Don't forget to add the last section
-    if current_heading is not None and current_section:
-        section_text = f"{current_heading.text}\n" + "\n".join(current_section)
-        sections.append(
-            TextSection(
-                text=section_text.strip(),
-                link=_build_gdoc_section_link(doc_id, current_heading.id),
-            )
-        )
+    _add_finished_section(sections, doc_id, tab_id, current_heading, current_section)
 
     return sections

From a04132fdc83dae940991a21d8f90ecd153532618 Mon Sep 17 00:00:00 2001
From: Evan Lohn <evan@danswer.ai>
Date: Fri, 4 Apr 2025 12:49:45 -0700
Subject: [PATCH 2/7] WIP almost done, but realized we can just do basic
 retrieval

---
 backend/onyx/configs/app_configs.py           |   2 +
 .../onyx/connectors/google_drive/connector.py | 115 +++++++++++++
 .../connectors/google_drive/doc_conversion.py |   8 +-
 .../google_drive/section_extraction.py        | 151 +++++++++++++++---
 .../connectors/google_utils/google_utils.py   |  48 ++++++
 .../onyx/connectors/google_utils/resources.py |  11 ++
 .../google_utils/shared_constants.py          |  14 ++
 7 files changed, 328 insertions(+), 21 deletions(-)

diff --git a/backend/onyx/configs/app_configs.py b/backend/onyx/configs/app_configs.py
index a293df6248..f45808f937 100644
--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -163,6 +163,8 @@
 
 MAX_DRIVE_WORKERS = int(os.environ.get("MAX_DRIVE_WORKERS", 4))
 
+USE_SMART_CHIP_SCOPES = os.environ.get("USE_SMART_CHIP_SCOPES", "").lower() == "true"
+
 # Below are intended to match the env variables names used by the official postgres docker image
 # https://hub.docker.com/_/postgres
 POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py
index f9764535b2..551a10a73d 100644
--- a/backend/onyx/connectors/google_drive/connector.py
+++ b/backend/onyx/connectors/google_drive/connector.py
@@ -18,6 +18,7 @@
 from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
 from onyx.configs.app_configs import INDEX_BATCH_SIZE
 from onyx.configs.app_configs import MAX_DRIVE_WORKERS
+from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.exceptions import ConnectorValidationError
 from onyx.connectors.exceptions import CredentialExpiredError
@@ -39,12 +40,16 @@
 from onyx.connectors.google_drive.models import RetrievedDriveFile
 from onyx.connectors.google_drive.models import StageCompletion
 from onyx.connectors.google_utils.google_auth import get_google_creds
+from onyx.connectors.google_utils.google_utils import create_scripts_file_objects
 from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
 from onyx.connectors.google_utils.google_utils import GoogleFields
+from onyx.connectors.google_utils.google_utils import SMART_CHIP_SCRIPT_FILE_NAME
 from onyx.connectors.google_utils.resources import get_admin_service
 from onyx.connectors.google_utils.resources import get_drive_service
 from onyx.connectors.google_utils.resources import get_google_docs_service
+from onyx.connectors.google_utils.resources import get_google_scripts_service
 from onyx.connectors.google_utils.resources import GoogleDriveService
+from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.google_utils.shared_constants import (
     DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
 )
@@ -90,6 +95,7 @@ def _convert_single_file(
     creds: Any,
     allow_images: bool,
     size_threshold: int,
+    smart_chips_deployment_id: str,
     retriever_email: str,
     file: dict[str, Any],
 ) -> Document | ConnectorFailure | None:
@@ -107,10 +113,15 @@ def _convert_single_file(
     docs_service = lazy_eval(
         lambda: get_google_docs_service(creds, user_email=user_email)
     )
+    scripts_service = lazy_eval(
+        lambda: get_google_scripts_service(creds, user_email=user_email)
+    )
     return convert_drive_item_to_document(
         file=file,
         drive_service=user_drive_service,
         docs_service=docs_service,
+        scripts_service=scripts_service,
+        smart_chips_deployment_id=smart_chips_deployment_id,
         allow_images=allow_images,
         size_threshold=size_threshold,
     )
@@ -176,6 +187,7 @@ def __init__(
         my_drive_emails: str | None = None,
         shared_folder_urls: str | None = None,
         batch_size: int = INDEX_BATCH_SIZE,
+        smart_chip_deployment_id: str = "",
         # OLD PARAMETERS
         folder_paths: list[str] | None = None,
         include_shared: bool | None = None,
@@ -248,6 +260,8 @@ def __init__(
         self._retrieved_ids: set[str] = set()
         self.allow_images = False
 
+        self.smart_chip_deployment_id = smart_chip_deployment_id
+
         self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
 
     def set_allow_images(self, value: bool) -> None:
@@ -295,8 +309,108 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None
             source=DocumentSource.GOOGLE_DRIVE,
         )
 
+        if USE_SMART_CHIP_SCOPES:
+            self.upsert_smart_chip_app_script()
+
         return new_creds_dict
 
+    @staticmethod
+    def _get_latest_deployment(
+        scripts_service: GoogleScriptsService, script_id: str
+    ) -> dict[str, Any]:
+        deployments = (
+            scripts_service.projects()
+            .deployments()
+            .list(
+                scriptId=script_id,
+            )
+            .execute()
+        )
+        all_deployments = deployments.get("deployments", [])
+        while "nextPageToken" in deployments:
+            deployments = (
+                scripts_service.projects()
+                .deployments()
+                .list(
+                    scriptId=script_id,
+                    pageToken=deployments["nextPageToken"],
+                )
+                .execute()
+            )
+            all_deployments.extend(deployments.get("deployments", []))
+
+        if len(all_deployments) == 0:
+            raise RuntimeError(f"No deployments found for script {script_id}")
+        return max(
+            all_deployments,
+            key=lambda x: datetime.fromisoformat(x["updateTime"]).timestamp(),
+        )
+
+    def upsert_smart_chip_app_script(self) -> None:
+        assert self._creds is not None, "creds not set"
+
+        # If a deployment id is provided, we don't need to create a new script.
+        # The deployment id can be retrieved by going under
+        # Deploy -> Test deployments -> Head Deployment ID in the UI (script.google.com)
+        if self.smart_chip_deployment_id:
+            return
+
+        # Step 1: Check if the script already exists by searching the admin drive.
+        drive_service = get_drive_service(
+            self._creds, user_email=self.primary_admin_email
+        )
+        q = f"mimeType = 'application/vnd.google-apps.script' and name = '{SMART_CHIP_SCRIPT_FILE_NAME}' and trashed = false"
+        script_search = (
+            drive_service.files()
+            .list(
+                corpora="user",
+                fields="files(mimeType, id, name)",
+                q=q,
+            )
+            .execute()
+        )
+        script_id = (script_search.get("files") or [{}])[0].get("id")
+        scripts_service = get_google_scripts_service(
+            self._creds, user_email=self.primary_admin_email
+        )
+        if not script_id:
+            # Step 2: Create the script if nonexistent
+            # (Takes about ~10 seconds)
+            req = scripts_service.projects().create(
+                body={"title": SMART_CHIP_SCRIPT_FILE_NAME}
+            )
+            response = req.execute()
+
+            if "scriptId" not in response:
+                raise RuntimeError(
+                    f"Failed to create Smart Chip App Script: {response}"
+                )
+
+            script_id = response["scriptId"]
+            scripts_files = create_scripts_file_objects()
+            # Step 3: Update (upload) the script content
+            response = (
+                scripts_service.projects()
+                .updateContent(scriptId=script_id, body={"files": scripts_files})
+                .execute()
+            )
+
+            if "scriptId" not in response:
+                raise RuntimeError(
+                    f"Failed to update Smart Chip App Script: {response}"
+                )
+
+            script_id = response["scriptId"]
+
+        # Step 4: Get the deployment id
+        self.smart_chip_deployment_id = self._get_latest_deployment(
+            scripts_service, script_id
+        )["deploymentId"]
+
+        # TODO: upsert new version if out of date. We don't expect to do this often.
+        # One way would be to check whether the script files have changed (either via git
+        # or actually pulling the current content and comparing).
+
     def _update_traversed_parent_ids(self, folder_id: str) -> None:
         self._retrieved_ids.add(folder_id)
 
@@ -951,6 +1065,7 @@ def _extract_docs_from_google_drive(
                 self.creds,
                 self.allow_images,
                 self.size_threshold,
+                self.smart_chip_deployment_id,
             )
             # Fetch files in batches
             batches_complete = 0
diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py
index ce3800e59e..cd4ff05914 100644
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -14,6 +14,7 @@
 from onyx.connectors.google_drive.section_extraction import get_document_sections
 from onyx.connectors.google_utils.resources import GoogleDocsService
 from onyx.connectors.google_utils.resources import GoogleDriveService
+from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.models import ConnectorFailure
 from onyx.connectors.models import Document
 from onyx.connectors.models import DocumentFailure
@@ -224,6 +225,8 @@ def convert_drive_item_to_document(
     file: GoogleDriveFileType,
     drive_service: Callable[[], GoogleDriveService],
     docs_service: Callable[[], GoogleDocsService],
+    scripts_service: Callable[[], GoogleScriptsService],
+    smart_chips_deployment_id: str,
     allow_images: bool,
     size_threshold: int,
 ) -> Document | ConnectorFailure | None:
@@ -244,7 +247,10 @@ def convert_drive_item_to_document(
             try:
                 # get_document_sections is the advanced approach for Google Docs
                 doc_sections = get_document_sections(
-                    docs_service=docs_service(), doc_id=file.get("id", "")
+                    docs_service=docs_service(),
+                    scripts_service=scripts_service(),
+                    smart_chips_deployment_id=smart_chips_deployment_id,
+                    doc_id=file.get("id", ""),
                 )
                 if doc_sections:
                     sections = cast(list[TextSection | ImageSection], doc_sections)
diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py
index e571302fbc..2d78e31276 100644
--- a/backend/onyx/connectors/google_drive/section_extraction.py
+++ b/backend/onyx/connectors/google_drive/section_extraction.py
@@ -1,11 +1,18 @@
+from collections.abc import Callable
 from typing import Any
 
 from pydantic import BaseModel
 
+from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.connectors.google_utils.resources import GoogleDocsService
+from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.models import TextSection
 
 
+DRIVE_CHIP_CHAR = "\ue907"
+UNKNOWN_SMART_CHIP_STR = "<Smart Chip>"
+
+
 class CurrentHeading(BaseModel):
     id: str | None
     text: str
@@ -24,7 +31,9 @@ def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
     return paragraph["paragraphStyle"]["headingId"]
 
 
-def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
+def _extract_text_from_paragraph(
+    paragraph: dict[str, Any], extract_chip: Callable[[int], str | None]
+) -> tuple[str, int]:
     """Extracts the text content from a paragraph element"""
     text_elements = []
     for element in paragraph.get("elements", []):
@@ -33,39 +42,135 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
 
         # Handle links
         if "textStyle" in element and "link" in element["textStyle"]:
-            text_elements.append(f"[{element['textStyle']['link'].get('url', '')}]")
-    return "".join(text_elements)
+            text_elements.append(f"({element['textStyle']['link'].get('url', '')})")
+
+        if "person" in element:
+            # Person chip: render whichever of name/email the API returned,
+            # comma-separated, without a dangling ", " when email is missing.
+            person_props = element["person"].get("personProperties", {})
+            person_fields = {
+                "name": person_props.get("name", ""),
+                "email": person_props.get("email", ""),
+            }
+            shown = [f"{key}: {val}" for key, val in person_fields.items() if val]
+            text_elements.append(f"<Person|{', '.join(shown)}>")
+
+        if "richLink" in element:
+            props = element["richLink"].get("richLinkProperties", {})
+            title = props.get("title", "")
+            uri = props.get("uri", "")
+            link_str = f"[{title}]({uri})"
+            text_elements.append(link_str)
+
+    ret = "".join(text_elements)
+
+    # add chip strings in place of each non-text
+    text_chunks = ret.split(DRIVE_CHIP_CHAR)
+    num_non_text_elements = len(text_chunks) - 1
+    for i in range(num_non_text_elements):
+        text_chunks[i] += extract_chip(i) or UNKNOWN_SMART_CHIP_STR
+    return "".join(text_chunks), num_non_text_elements
+
+
+def _extract_smart_chips_from_document(
+    document_id: str,
+    scripts_service: GoogleScriptsService,
+    deployment_id: str,
+) -> dict[str, str]:
+    """Extracts smart chips from a Google Doc. Returns a dictionary where
+    the keys are the smart chip location keys and the values are the smart chip text.
+
+    This calls a Google Apps Script function, because most smart chips are not currently
+    available through the API https://issuetracker.google.com/issues/225584757
+
+    Each location key is formatted as "tabNum_paragraphNum_nonTextIndexNum".
+    nonTextIndexNum refers to the index at which the value was found while traversing
+    the paragraph or table cell from left to right, top to bottom.
+
+    There are many non-text elements that are currently not supported by Apps Script, (see
+    https://developers.google.com/apps-script/reference/document/element-type ), so some
+    non-text elements won't have an associated text value.
+    """
+
+    # NOTE: the documentation is incorrect; the script id you must specify is
+    # actually the deployment id (what comes up when you go to Deploy-> Test Deployments)
+    http_request = scripts_service.scripts().run(
+        scriptId=deployment_id,
+        body={
+            "function": "docToChips",
+            "parameters": [document_id],
+            # "devMode": True
+        },
+    )
+    doc = http_request.execute()
+    return doc.get("response", {}).get("result", {})
 
 
-def _extract_text_from_table(table: dict[str, Any]) -> str:
-    """Extracts the text content from a table element"""
+def _extract_text_from_table(
+    table: dict[str, Any], extract_chip: Callable[[int], str | None]
+) -> str:
+    """
+    Extracts the text content from a table element.
+    Smart chip extraction will be wrong for nested tables.
+    """
     row_strs = []
+    seen_non_text = 0
+
+    def table_extract_chip(non_text_index: int) -> str | None:
+        return extract_chip(non_text_index + seen_non_text)
+
     for row in table.get("tableRows", []):
         cells = row.get("tableCells", [])
-        cell_strs = [
-            _extract_text_from_paragraph(cell.get("content", {})) for cell in cells
-        ]
+        cell_strs = []
+        for cell in cells:
+            child_elements = cell.get("content", {})
+            cell_str = []
+            for child_elem in child_elements:
+                if "paragraph" not in child_elem:
+                    continue
+                text, num_non_text_elements = _extract_text_from_paragraph(
+                    child_elem["paragraph"], table_extract_chip
+                )
+                cell_str.append(text)
+                seen_non_text += num_non_text_elements
+
+            cell_strs.append("".join(cell_str))
         row_strs.append(", ".join(cell_strs))
     return "\n".join(row_strs)
 
 
 def get_document_sections(
     docs_service: GoogleDocsService,
+    scripts_service: GoogleScriptsService,
+    smart_chips_deployment_id: str,
     doc_id: str,
 ) -> list[TextSection]:
     """Extracts sections from a Google Doc, including their headings and content"""
     # Fetch the document structure
-    doc = (
-        docs_service.documents()
-        .get(documentId=doc_id, includeTabsContent=True)
-        .execute()
-    )
+    http_request = docs_service.documents().get(documentId=doc_id)
+
+    # Google has poor support for tabs in the docs api, see
+    # https://cloud.google.com/python/docs/reference/cloudtasks/
+    # latest/google.cloud.tasks_v2.types.HttpRequest
+    # https://developers.google.com/workspace/docs/api/how-tos/tabs
+    # https://developers.google.com/workspace/docs/api/reference/rest/v1/documents/get
+    # this is a hack to use the param mentioned in the rest api docs
+    # TODO: check if it can be specified i.e. in documents()
+    http_request.uri += "&includeTabsContent=true"
+    doc = http_request.execute()
+
+    smart_chips = {}
+    if USE_SMART_CHIP_SCOPES:
+        # Get the smart chips
+        smart_chips = _extract_smart_chips_from_document(
+            doc_id, scripts_service, smart_chips_deployment_id
+        )
 
     # Get the content
     tabs = doc.get("tabs", {})
     sections: list[TextSection] = []
-    for tab in tabs:
-        sections.extend(get_tab_sections(tab, doc_id))
+    for tab_num, tab in enumerate(tabs):
+        sections.extend(get_tab_sections(tab, doc_id, tab_num, smart_chips))
     return sections
 
 
@@ -106,7 +211,9 @@ def _add_finished_section(
     )
 
 
-def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
+def get_tab_sections(
+    tab: dict[str, Any], doc_id: str, tab_num: int, smart_chips: dict[str, str]
+) -> list[TextSection]:
     tab_id = tab["tabProperties"]["tabId"]
     content = tab.get("documentTab", {}).get("body", {}).get("content", [])
 
@@ -114,13 +221,17 @@ def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
     current_section: list[str] = []
     current_heading = CurrentHeading(id=None, text="")
 
-    for element in content:
+    for element_num, element in enumerate(content):
+
+        def extract_chip(non_text_index: int) -> str | None:
+            return smart_chips.get(f"{tab_num}_{element_num-1}_{non_text_index}")
+
         if "paragraph" in element:
             paragraph = element["paragraph"]
 
             # If this is not a heading, add content to current section
             if not _is_heading(paragraph):
-                text = _extract_text_from_paragraph(paragraph)
+                text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
                 if text.strip():
                     current_section.append(text)
                 continue
@@ -133,13 +244,13 @@ def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
 
             # Start new heading
             heading_id = _extract_id_from_heading(paragraph)
-            heading_text = _extract_text_from_paragraph(paragraph)
+            heading_text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
             current_heading = CurrentHeading(
                 id=heading_id,
                 text=heading_text,
             )
         elif "table" in element:
-            text = _extract_text_from_table(element["table"])
+            text = _extract_text_from_table(element["table"], extract_chip)
             if text.strip():
                 current_section.append(text)
 
diff --git a/backend/onyx/connectors/google_utils/google_utils.py b/backend/onyx/connectors/google_utils/google_utils.py
index 4ad6cbe7af..51ec2181a3 100644
--- a/backend/onyx/connectors/google_utils/google_utils.py
+++ b/backend/onyx/connectors/google_utils/google_utils.py
@@ -1,3 +1,4 @@
+import json
 import re
 import time
 from collections.abc import Callable
@@ -141,3 +142,50 @@ def execute_paginated_retrieval(
                 yield item
         else:
             yield results
+
+
+# https://developers.google.com/apps-script/api/reference/rest/v1/File#FileType
+class AppsScriptFileType(str, Enum):
+    UNSPECIFIED = "ENUM_TYPE_UNSPECIFIED"
+    SERVER_JS = "SERVER_JS"
+    HTML = "HTML"
+    JSON = "JSON"
+
+
+SMART_CHIP_RETRIEVAL_FUNCTIONS = [
+    ("docToChips", ["document_id"]),
+    ("getKey", ["tabInd", "paragraphInd", "nonTextInd"]),
+    ("parseParagraph", ["paragraph", "callback"]),
+    ("parseTable", ["table", "callback"]),
+]
+
+SMART_CHIP_SCRIPT_FILE_NAME = "Smart_Chip_Extractor"
+
+
+# https://developers.google.com/apps-script/api/reference/rest/v1/projects/updateContent
+def create_scripts_file_objects() -> list[GoogleDriveFileType]:
+    """Builds the `files` payload for projects.updateContent. Sources are read
+    relative to this module, not the process working directory."""
+    from pathlib import Path
+    # This module is in google_utils/; the script sources are in ../google_drive/.
+    drive_dir = Path(__file__).resolve().parent.parent / "google_drive"
+    script_source = (drive_dir / "smart_chip_retrieval.gs").read_text(encoding="utf-8")
+    manifest = json.loads((drive_dir / "appsscript.json").read_text(encoding="utf-8"))
+    return [
+        {
+            "name": "appsscript",
+            "type": AppsScriptFileType.JSON.value,
+            "source": json.dumps(manifest),
+        },
+        {
+            "name": SMART_CHIP_SCRIPT_FILE_NAME,
+            "type": AppsScriptFileType.SERVER_JS.value,
+            "source": script_source,
+            "functionSet": {
+                "values": [
+                    {"name": name, "parameters": params}
+                    for name, params in SMART_CHIP_RETRIEVAL_FUNCTIONS
+                ],
+            },
+        },
+    ]
diff --git a/backend/onyx/connectors/google_utils/resources.py b/backend/onyx/connectors/google_utils/resources.py
index 48bd981c2b..a12f30cf67 100644
--- a/backend/onyx/connectors/google_utils/resources.py
+++ b/backend/onyx/connectors/google_utils/resources.py
@@ -12,6 +12,10 @@ class GoogleDocsService(Resource):
     pass
 
 
+class GoogleScriptsService(Resource):
+    pass
+
+
 class AdminService(Resource):
     pass
 
@@ -62,3 +66,10 @@ def get_gmail_service(
     user_email: str | None = None,
 ) -> GmailService:
     return _get_google_service("gmail", "v1", creds, user_email)
+
+
+def get_google_scripts_service(
+    creds: ServiceAccountCredentials | OAuthCredentials,
+    user_email: str | None = None,
+) -> GoogleScriptsService:
+    return _get_google_service("script", "v1", creds, user_email)
diff --git a/backend/onyx/connectors/google_utils/shared_constants.py b/backend/onyx/connectors/google_utils/shared_constants.py
index bd5ebb11ec..483ea6dd35 100644
--- a/backend/onyx/connectors/google_utils/shared_constants.py
+++ b/backend/onyx/connectors/google_utils/shared_constants.py
@@ -1,5 +1,6 @@
 from enum import Enum as PyEnum
 
+from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.configs.constants import DocumentSource
 
 # NOTE: do not need https://www.googleapis.com/auth/documents.readonly
@@ -18,6 +19,19 @@
     ],
 }
 
+# TODO: add this to the docs
+GOOGLE_SMART_CHIP_SCOPES = [
+    "https://www.googleapis.com/auth/script.external_request",
+    "https://www.googleapis.com/auth/drive.scripts",
+    "https://www.googleapis.com/auth/script.scriptapp",
+    "https://www.googleapis.com/auth/script.deployments",
+    "https://www.googleapis.com/auth/script.projects",
+    "https://www.googleapis.com/auth/documents",
+]
+
+if USE_SMART_CHIP_SCOPES:
+    GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE] += GOOGLE_SMART_CHIP_SCOPES
+
 # This is the Oauth token
 DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens"
 # This is the service account key

From d31e1407f2843c4381a14d510a76a4f02ca1475e Mon Sep 17 00:00:00 2001
From: Evan Lohn <evan@danswer.ai>
Date: Fri, 4 Apr 2025 16:53:30 -0700
Subject: [PATCH 3/7] rebased and added scripts

---
 .../connectors/google_drive/appsscript.json   |  17 +++
 .../google_drive/smart_chip_retrieval.gs      | 132 ++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 backend/onyx/connectors/google_drive/appsscript.json
 create mode 100644 backend/onyx/connectors/google_drive/smart_chip_retrieval.gs

diff --git a/backend/onyx/connectors/google_drive/appsscript.json b/backend/onyx/connectors/google_drive/appsscript.json
new file mode 100644
index 0000000000..1a9e54269b
--- /dev/null
+++ b/backend/onyx/connectors/google_drive/appsscript.json
@@ -0,0 +1,17 @@
+{
+    "timeZone": "America/Los_Angeles",
+    "dependencies": {
+      "enabledAdvancedServices": [
+        {
+          "userSymbol": "Docs",
+          "version": "v1",
+          "serviceId": "docs"
+        }
+      ]
+    },
+    "exceptionLogging": "STACKDRIVER",
+    "runtimeVersion": "V8",
+    "executionApi": {
+      "access": "MYSELF"
+    }
+  }
\ No newline at end of file
diff --git a/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs b/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs
new file mode 100644
index 0000000000..0419adc179
--- /dev/null
+++ b/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs
@@ -0,0 +1,132 @@
+
+/**
+ * Opens the Google Doc with the given id and collects replacement text for the
+ * non-text elements reported by parseParagraph/parseTable, returning an object
+ * keyed by getKey(tab index, body child index, non-text index).
+ * Dates and equations are reported; people and rich links are only logged.
+ */
+function docToChips(document_id) {
+  const doc = DocumentApp.openById(document_id);
+  // plain object rather than a Map: entries are written as properties below
+  const ret = {};
+  doc.getTabs().forEach((tab, tabInd) => {
+    const body = tab.asDocumentTab().getBody();
+    for (let tabChildInd = 0; tabChildInd < body.getNumChildren(); tabChildInd++) {
+      const tabChild = body.getChild(tabChildInd);
+      const callback = (nonTextInd, replaceText) => {
+        ret[getKey(tabInd, tabChildInd, nonTextInd)] = replaceText;
+      };
+      switch (tabChild.getType()) {
+        case DocumentApp.ElementType.PARAGRAPH:
+          parseParagraph(tabChild.asParagraph(), callback);
+          console.log("paragraph", tabChild.asParagraph().getText());
+          break;
+        case DocumentApp.ElementType.TABLE:
+          console.log("table");
+          parseTable(tabChild.asTable(), callback);
+          break;
+        case DocumentApp.ElementType.LIST_ITEM:
+          // parseParagraph also handles list items
+          parseParagraph(tabChild.asListItem(), callback);
+          break;
+        default:
+          console.log("found unknown tab body child of type: ", tabChild.getType().toString());
+      }
+    }
+  });
+  console.log(ret);
+  return ret;
+}
+
+// uncomment and paste in a file id (and change the main function to "test")
+// to test the docToChips function
+// function test() {
+//   return docToChips("document id goes here");
+// }
+
+function getKey(tabInd, paragraphInd, nonTextInd) { // builds the "<tab>_<paragraph>_<nonText>" key
+  return `${tabInd}_${paragraphInd}_${nonTextInd}`;
+}
+
+// Walks the children of a paragraph (also used for list items) and reports replacement text via callback(nonTextInd, text).
+function parseParagraph(paragraph, callback) {
+  var nonTextInd = 0;
+  for (let i = 0; i < paragraph.getNumChildren(); i++) { // one pass per child; nonTextInd is bumped at the end of each pass
+    var child = paragraph.getChild(i);
+    switch (child.getType()) {
+      case DocumentApp.ElementType.DATE:
+        console.log(child.asDate().getDisplayText());
+        callback(nonTextInd, child.asDate().getDisplayText());
+        break;
+      case DocumentApp.ElementType.EQUATION:
+        var eqStr = child.getText();
+        console.log("equation: ", eqStr);
+        callback(nonTextInd, eqStr);
+        break;
+      case DocumentApp.ElementType.PERSON:
+        var personStr = "<name: " + child.asPerson().getName() + ", email: "+ child.asPerson().getEmail() + ">";
+        console.log(personStr);
+        // not reported via callback; only logged
+        nonTextInd--; // Advanced Docs API picks up people, so cancel the increment below
+        break;
+      case DocumentApp.ElementType.RICH_LINK:
+        var richLink = child.asRichLink()
+        var linkStr = "<title: " + richLink.getTitle() + ", type:" + richLink.getMimeType() + ">"
+        console.log(linkStr);
+        // not reported via callback; only logged
+        nonTextInd--; // Advanced Docs API picks up rich links, so cancel the increment below
+        break;
+      case DocumentApp.ElementType.TEXT:
+        console.log("text: "+ child.asText().getText());
+        // plain text does not consume a non-text index: cancel the increment below
+        nonTextInd--;
+        break;
+      case DocumentApp.ElementType.UNSUPPORTED:
+        console.log("unsupported element type");
+        break;
+      default:
+        console.log("found special element type:", child.getType().toString());
+    }
+    nonTextInd++;
+  }
+}
+
+function parseTable(table, callback) { // walks rows -> cells -> cell children and forwards reported text to callback
+  var lastSeenInCell = 0; // callbacks made so far in the current cell
+  var allSeenElems = 0 // callbacks made in cells that are already finished
+  const tableCallback = ((nonTextInd, replaceText) => {
+    callback(allSeenElems + lastSeenInCell + nonTextInd, replaceText); // NOTE(review): nonTextInd and lastSeenInCell both grow within one paragraph; possible double count, confirm
+    lastSeenInCell++;
+  });
+  for (let rowInd = 0; rowInd < table.getNumChildren(); rowInd++) {
+    var row = table.getChild(rowInd);
+    if (row.getType() !== DocumentApp.ElementType.TABLE_ROW) { // skip anything that is not a row
+      console.log("table child type: ", row.getType().toString());
+      continue;
+    }
+
+    for (let colInd = 0; colInd < row.getNumChildren(); colInd++) {
+      var cell = row.getChild(colInd);
+      if (cell.getType() !== DocumentApp.ElementType.TABLE_CELL) { // skip anything that is not a cell
+        console.log("row child type: ", cell.getType().toString());
+        continue;
+      }
+
+      for (let itemInd = 0; itemInd < cell.getNumChildren(); itemInd++) {
+        var item = cell.getChild(itemInd);
+        console.log(item.getType().toString());
+        switch (item.getType()) {
+          case DocumentApp.ElementType.PARAGRAPH:
+          case DocumentApp.ElementType.LIST_ITEM:
+            parseParagraph(item, tableCallback);
+            break;
+          case DocumentApp.ElementType.TABLE:
+            parseTable(item, tableCallback); // nested table reports through this table's callback
+            break;
+        } // other child types are ignored
+      }
+      allSeenElems += lastSeenInCell; // fold this cell's count into the running total
+      lastSeenInCell = 0;
+    }
+  }
+}
\ No newline at end of file

From 7a67e1196acf2aab3dedba6dfb5881d6f214f7f2 Mon Sep 17 00:00:00 2001
From: Evan Lohn <evan@danswer.ai>
Date: Fri, 4 Apr 2025 16:46:05 -0700
Subject: [PATCH 4/7] improved approach to extracting smart chips

---
 backend/onyx/configs/app_configs.py           |   2 -
 .../onyx/connectors/google_drive/connector.py | 115 ------------------
 .../connectors/google_drive/doc_conversion.py |  85 ++++++++++++-
 .../google_drive/section_extraction.py        | 104 +++-------------
 .../connectors/google_utils/google_utils.py   |  48 --------
 .../onyx/connectors/google_utils/resources.py |  11 --
 .../google_utils/shared_constants.py          |  14 ---
 backend/onyx/connectors/models.py             |   2 -
 8 files changed, 96 insertions(+), 285 deletions(-)

diff --git a/backend/onyx/configs/app_configs.py b/backend/onyx/configs/app_configs.py
index f45808f937..a293df6248 100644
--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -163,8 +163,6 @@
 
 MAX_DRIVE_WORKERS = int(os.environ.get("MAX_DRIVE_WORKERS", 4))
 
-USE_SMART_CHIP_SCOPES = os.environ.get("USE_SMART_CHIP_SCOPES", "").lower() == "true"
-
 # Below are intended to match the env variables names used by the official postgres docker image
 # https://hub.docker.com/_/postgres
 POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py
index 551a10a73d..f9764535b2 100644
--- a/backend/onyx/connectors/google_drive/connector.py
+++ b/backend/onyx/connectors/google_drive/connector.py
@@ -18,7 +18,6 @@
 from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
 from onyx.configs.app_configs import INDEX_BATCH_SIZE
 from onyx.configs.app_configs import MAX_DRIVE_WORKERS
-from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.exceptions import ConnectorValidationError
 from onyx.connectors.exceptions import CredentialExpiredError
@@ -40,16 +39,12 @@
 from onyx.connectors.google_drive.models import RetrievedDriveFile
 from onyx.connectors.google_drive.models import StageCompletion
 from onyx.connectors.google_utils.google_auth import get_google_creds
-from onyx.connectors.google_utils.google_utils import create_scripts_file_objects
 from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
 from onyx.connectors.google_utils.google_utils import GoogleFields
-from onyx.connectors.google_utils.google_utils import SMART_CHIP_SCRIPT_FILE_NAME
 from onyx.connectors.google_utils.resources import get_admin_service
 from onyx.connectors.google_utils.resources import get_drive_service
 from onyx.connectors.google_utils.resources import get_google_docs_service
-from onyx.connectors.google_utils.resources import get_google_scripts_service
 from onyx.connectors.google_utils.resources import GoogleDriveService
-from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.google_utils.shared_constants import (
     DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
 )
@@ -95,7 +90,6 @@ def _convert_single_file(
     creds: Any,
     allow_images: bool,
     size_threshold: int,
-    smart_chips_deployment_id: str,
     retriever_email: str,
     file: dict[str, Any],
 ) -> Document | ConnectorFailure | None:
@@ -113,15 +107,10 @@ def _convert_single_file(
     docs_service = lazy_eval(
         lambda: get_google_docs_service(creds, user_email=user_email)
     )
-    scripts_service = lazy_eval(
-        lambda: get_google_scripts_service(creds, user_email=user_email)
-    )
     return convert_drive_item_to_document(
         file=file,
         drive_service=user_drive_service,
         docs_service=docs_service,
-        scripts_service=scripts_service,
-        smart_chips_deployment_id=smart_chips_deployment_id,
         allow_images=allow_images,
         size_threshold=size_threshold,
     )
@@ -187,7 +176,6 @@ def __init__(
         my_drive_emails: str | None = None,
         shared_folder_urls: str | None = None,
         batch_size: int = INDEX_BATCH_SIZE,
-        smart_chip_deployment_id: str = "",
         # OLD PARAMETERS
         folder_paths: list[str] | None = None,
         include_shared: bool | None = None,
@@ -260,8 +248,6 @@ def __init__(
         self._retrieved_ids: set[str] = set()
         self.allow_images = False
 
-        self.smart_chip_deployment_id = smart_chip_deployment_id
-
         self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
 
     def set_allow_images(self, value: bool) -> None:
@@ -309,108 +295,8 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None
             source=DocumentSource.GOOGLE_DRIVE,
         )
 
-        if USE_SMART_CHIP_SCOPES:
-            self.upsert_smart_chip_app_script()
-
         return new_creds_dict
 
-    @staticmethod
-    def _get_latest_deployment(
-        scripts_service: GoogleScriptsService, script_id: str
-    ) -> dict[str, Any]:
-        deployments = (
-            scripts_service.projects()
-            .deployments()
-            .list(
-                scriptId=script_id,
-            )
-            .execute()
-        )
-        all_deployments = deployments.get("deployments", [])
-        while "nextPageToken" in deployments:
-            deployments = (
-                scripts_service.projects()
-                .deployments()
-                .list(
-                    scriptId=script_id,
-                    pageToken=deployments["nextPageToken"],
-                )
-                .execute()
-            )
-            all_deployments.extend(deployments.get("deployments", []))
-
-        if len(all_deployments) == 0:
-            raise RuntimeError(f"No deployments found for script {script_id}")
-        return max(
-            all_deployments,
-            key=lambda x: datetime.fromisoformat(x["updateTime"]).timestamp(),
-        )
-
-    def upsert_smart_chip_app_script(self) -> None:
-        assert self._creds is not None, "creds not set"
-
-        # If a deployment id is provided, we don't need to create a new script.
-        # The deployment id can be retrieved by going under
-        # Deploy -> Test deployments -> Head Deployment ID in the UI (script.google.com)
-        if self.smart_chip_deployment_id:
-            return
-
-        # Step 1: Check if the script already exists by searching the admin drive.
-        drive_service = get_drive_service(
-            self._creds, user_email=self.primary_admin_email
-        )
-        q = f"mimeType = 'application/vnd.google-apps.script' and name = '{SMART_CHIP_SCRIPT_FILE_NAME}' and trashed = false"
-        script_search = (
-            drive_service.files()
-            .list(
-                corpora="user",
-                fields="files(mimeType, id, name)",
-                q=q,
-            )
-            .execute()
-        )
-        script_id = (script_search.get("files") or [{}])[0].get("id")
-        scripts_service = get_google_scripts_service(
-            self._creds, user_email=self.primary_admin_email
-        )
-        if not script_id:
-            # Step 2: Create the script if nonexistent
-            # (Takes about ~10 seconds)
-            req = scripts_service.projects().create(
-                body={"title": SMART_CHIP_SCRIPT_FILE_NAME}
-            )
-            response = req.execute()
-
-            if "scriptId" not in response:
-                raise RuntimeError(
-                    f"Failed to create Smart Chip App Script: {response}"
-                )
-
-            script_id = response["scriptId"]
-            scripts_files = create_scripts_file_objects()
-            # Step 3: Update (upload) the script content
-            response = (
-                scripts_service.projects()
-                .updateContent(scriptId=script_id, body={"files": scripts_files})
-                .execute()
-            )
-
-            if "scriptId" not in response:
-                raise RuntimeError(
-                    f"Failed to update Smart Chip App Script: {response}"
-                )
-
-            script_id = response["scriptId"]
-
-        # Step 4: Get the deployment id
-        self.smart_chip_deployment_id = self._get_latest_deployment(
-            scripts_service, script_id
-        )["deploymentId"]
-
-        # TODO: upsert new version if out of date. We don't expect to do this often.
-        # One way would be to check whether the script files have changed (either via git
-        # or actually pulling the current content and comparing).
-
     def _update_traversed_parent_ids(self, folder_id: str) -> None:
         self._retrieved_ids.add(folder_id)
 
@@ -1065,7 +951,6 @@ def _extract_docs_from_google_drive(
                 self.creds,
                 self.allow_images,
                 self.size_threshold,
-                self.smart_chip_deployment_id,
             )
             # Fetch files in batches
             batches_complete = 0
diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py
index cd4ff05914..e80f6226d0 100644
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -12,9 +12,9 @@
 from onyx.connectors.google_drive.models import GDriveMimeType
 from onyx.connectors.google_drive.models import GoogleDriveFileType
 from onyx.connectors.google_drive.section_extraction import get_document_sections
+from onyx.connectors.google_drive.section_extraction import HEADING_DELIMITER
 from onyx.connectors.google_utils.resources import GoogleDocsService
 from onyx.connectors.google_utils.resources import GoogleDriveService
-from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.models import ConnectorFailure
 from onyx.connectors.models import Document
 from onyx.connectors.models import DocumentFailure
@@ -36,6 +36,10 @@
 
 logger = setup_logger()
 
+# U+E907 is a Unicode private-use code point (it has no standard meaning); it is used
+# by the docs advanced API to represent smart chips (elements like dates and doc links).
+SMART_CHIP_CHAR = "\ue907"
+
 # Mapping of Google Drive mime types to export formats
 GOOGLE_MIME_TYPES_TO_EXPORT = {
     GDriveMimeType.DOC.value: "text/plain",
@@ -221,12 +225,79 @@ def _download_and_extract_sections_basic(
             return []
 
 
+def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int:
+    """Return the index of the n-th `needle` at/after `start` (n <= 1: first), or -1."""
+    idx, remaining = haystack.find(needle, start), n
+    while idx >= 0 and remaining > 1:
+        idx, remaining = haystack.find(needle, idx + len(needle)), remaining - 1
+    return idx
+
+
+def align_basic_advanced(
+    basic_sections: list[TextSection | ImageSection], adv_sections: list[TextSection]
+) -> list[TextSection | ImageSection]:
+    """Re-split the basic (full-content) text at the advanced sections' headings.
+
+    The basic sections contain all content of the file, including smart chips
+    like dates and doc links. The advanced sections are split at headings and
+    carry heading links that improve the UX when a user clicks on a source.
+    Returns basic_sections as-is if there are fewer than two advanced sections,
+    else TextSections of basic text (images dropped) with the advanced links.
+    Matching is best-effort: a heading that is itself a smart chip, or chip text
+    containing the heading text, can misalign sections.
+    """
+    if len(adv_sections) <= 1:
+        return basic_sections  # no benefit from aligning
+
+    basic_full_text = "".join(
+        [section.text for section in basic_sections if isinstance(section, TextSection)]
+    )
+    new_sections: list[TextSection | ImageSection] = []
+    heading_start = 0
+    for adv_ind in range(1, len(adv_sections)):
+        heading = adv_sections[adv_ind].text.split(HEADING_DELIMITER)[0]
+        # retrieve the longest part of the heading that is not a smart chip
+        heading_key = max(heading.split(SMART_CHIP_CHAR), key=len).strip()
+        if heading_key == "":
+            logger.warning(
+                f"Cannot match heading: {heading}, its link will come from the following section"
+            )
+            continue
+        heading_offset = heading.find(heading_key)
+
+        # count occurrences of heading str in previous section
+        heading_count = adv_sections[adv_ind - 1].text.count(heading_key)
+
+        prev_start = heading_start
+        # The search begins at the previous section's start, so skip the
+        # heading_count occurrences inside it; the heading is the next one.
+        key_pos = _find_nth(basic_full_text, heading_key, heading_count + 1, prev_start)
+        if key_pos < 0:
+            logger.warning(
+                f"Heading key {heading_key} from heading {heading} not found in basic text"
+            )
+            continue
+        # never move backwards past the previous section start (offset is approximate)
+        heading_start = max(key_pos - heading_offset, prev_start)
+
+        new_sections.append(
+            TextSection(
+                link=adv_sections[adv_ind - 1].link,
+                text=basic_full_text[prev_start:heading_start],
+            )
+        )
+
+    # handle last section
+    new_sections.append(
+        TextSection(link=adv_sections[-1].link, text=basic_full_text[heading_start:])
+    )
+    return new_sections
+
+
 def convert_drive_item_to_document(
     file: GoogleDriveFileType,
     drive_service: Callable[[], GoogleDriveService],
     docs_service: Callable[[], GoogleDocsService],
-    scripts_service: Callable[[], GoogleScriptsService],
-    smart_chips_deployment_id: str,
     allow_images: bool,
     size_threshold: int,
 ) -> Document | ConnectorFailure | None:
@@ -248,12 +319,16 @@ def convert_drive_item_to_document(
                 # get_document_sections is the advanced approach for Google Docs
                 doc_sections = get_document_sections(
                     docs_service=docs_service(),
-                    scripts_service=scripts_service(),
-                    smart_chips_deployment_id=smart_chips_deployment_id,
                     doc_id=file.get("id", ""),
                 )
                 if doc_sections:
                     sections = cast(list[TextSection | ImageSection], doc_sections)
+                    if any(SMART_CHIP_CHAR in section.text for section in doc_sections):
+                        basic_sections = _download_and_extract_sections_basic(
+                            file, drive_service(), allow_images
+                        )
+                        sections = align_basic_advanced(basic_sections, doc_sections)
+
             except Exception as e:
                 logger.warning(
                     f"Error in advanced parsing: {e}. Falling back to basic extraction."
diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py
index 2d78e31276..701d5d0c30 100644
--- a/backend/onyx/connectors/google_drive/section_extraction.py
+++ b/backend/onyx/connectors/google_drive/section_extraction.py
@@ -1,16 +1,11 @@
-from collections.abc import Callable
 from typing import Any
 
 from pydantic import BaseModel
 
-from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.connectors.google_utils.resources import GoogleDocsService
-from onyx.connectors.google_utils.resources import GoogleScriptsService
 from onyx.connectors.models import TextSection
 
-
-DRIVE_CHIP_CHAR = "\ue907"
-UNKNOWN_SMART_CHIP_STR = "<Smart Chip>"
+HEADING_DELIMITER = "\n"
 
 
 class CurrentHeading(BaseModel):
@@ -31,9 +26,7 @@ def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
     return paragraph["paragraphStyle"]["headingId"]
 
 
-def _extract_text_from_paragraph(
-    paragraph: dict[str, Any], extract_chip: Callable[[int], str | None]
-) -> tuple[str, int]:
+def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
     """Extracts the text content from a paragraph element"""
     text_elements = []
     for element in paragraph.get("elements", []):
@@ -62,62 +55,14 @@ def _extract_text_from_paragraph(
             link_str = f"[{title}]({uri})"
             text_elements.append(link_str)
 
-    ret = "".join(text_elements)
-
-    # add chip strings in place of each non-text
-    text_chunks = ret.split(DRIVE_CHIP_CHAR)
-    num_non_text_elements = len(text_chunks) - 1
-    for i in range(num_non_text_elements):
-        text_chunks[i] += extract_chip(i) or UNKNOWN_SMART_CHIP_STR
-    return "".join(text_chunks), num_non_text_elements
-
-
-def _extract_smart_chips_from_document(
-    document_id: str,
-    scripts_service: GoogleScriptsService,
-    deployment_id: str,
-) -> dict[str, str]:
-    """Extracts smart chips from a Google Doc. Returns a dictionary where
-    the keys are the smart chip location keys and the values are the smart chip text.
-
-    This calls a Google Apps Script function, because most smart chips are not currently
-    available through the API https://issuetracker.google.com/issues/225584757
+    return "".join(text_elements)
 
-    Each location key is formatted as "tabNum_paragraphNum_nonTextIndexNum".
-    nonTextIndexNum refers to the index at which the value was found while traversing
-    the paragraph or table cell from left to right, top to bottom.
 
-    There are many non-text elements that are currently not supported by Apps Script, (see
-    https://developers.google.com/apps-script/reference/document/element-type ), so some
-    non-text elements won't have an associated text value.
-    """
-
-    # NOTE: the documentation is incorrect; the script id you must specify is
-    # actually the deployment id (what comes up when you go to Deploy-> Test Deployments)
-    http_request = scripts_service.scripts().run(
-        scriptId=deployment_id,
-        body={
-            "function": "docToChips",
-            "parameters": [document_id],
-            # "devMode": True
-        },
-    )
-    doc = http_request.execute()
-    return doc.get("response", {}).get("result", {})
-
-
-def _extract_text_from_table(
-    table: dict[str, Any], extract_chip: Callable[[int], str | None]
-) -> str:
+def _extract_text_from_table(table: dict[str, Any]) -> str:
     """
     Extracts the text content from a table element.
-    Smart chip extraction will be wrong for nested tables.
     """
     row_strs = []
-    seen_non_text = 0
-
-    def table_extract_chip(non_text_index: int) -> str | None:
-        return extract_chip(non_text_index + seen_non_text)
 
     for row in table.get("tableRows", []):
         cells = row.get("tableCells", [])
@@ -128,12 +73,7 @@ def table_extract_chip(non_text_index: int) -> str | None:
             for child_elem in child_elements:
                 if "paragraph" not in child_elem:
                     continue
-                text, num_non_text_elements = _extract_text_from_paragraph(
-                    child_elem["paragraph"], table_extract_chip
-                )
-                cell_str.append(text)
-                seen_non_text += num_non_text_elements
-
+                cell_str.append(_extract_text_from_paragraph(child_elem["paragraph"]))
             cell_strs.append("".join(cell_str))
         row_strs.append(", ".join(cell_strs))
     return "\n".join(row_strs)
@@ -141,8 +81,6 @@ def table_extract_chip(non_text_index: int) -> str | None:
 
 def get_document_sections(
     docs_service: GoogleDocsService,
-    scripts_service: GoogleScriptsService,
-    smart_chips_deployment_id: str,
     doc_id: str,
 ) -> list[TextSection]:
     """Extracts sections from a Google Doc, including their headings and content"""
@@ -159,18 +97,11 @@ def get_document_sections(
     http_request.uri += "&includeTabsContent=true"
     doc = http_request.execute()
 
-    smart_chips = {}
-    if USE_SMART_CHIP_SCOPES:
-        # Get the smart chips
-        smart_chips = _extract_smart_chips_from_document(
-            doc_id, scripts_service, smart_chips_deployment_id
-        )
-
     # Get the content
     tabs = doc.get("tabs", {})
     sections: list[TextSection] = []
-    for tab_num, tab in enumerate(tabs):
-        sections.extend(get_tab_sections(tab, doc_id, tab_num, smart_chips))
+    for tab in tabs:
+        sections.extend(get_tab_sections(tab, doc_id))
     return sections
 
 
@@ -202,7 +133,10 @@ def _add_finished_section(
     if not (current_section or current_heading.text):
         return
     # If we were building a previous section, add it to sections list
-    section_text = f"{current_heading.text}\n" + "\n".join(current_section)
+
+    # this is unlikely to ever matter, but helps if the doc contains weird headings
+    header_text = current_heading.text.replace(HEADING_DELIMITER, "")
+    section_text = f"{header_text}{HEADING_DELIMITER}" + "\n".join(current_section)
     sections.append(
         TextSection(
             text=section_text.strip(),
@@ -211,9 +145,7 @@ def _add_finished_section(
     )
 
 
-def get_tab_sections(
-    tab: dict[str, Any], doc_id: str, tab_num: int, smart_chips: dict[str, str]
-) -> list[TextSection]:
+def get_tab_sections(tab: dict[str, Any], doc_id: str) -> list[TextSection]:
     tab_id = tab["tabProperties"]["tabId"]
     content = tab.get("documentTab", {}).get("body", {}).get("content", [])
 
@@ -221,17 +153,13 @@ def get_tab_sections(
     current_section: list[str] = []
     current_heading = CurrentHeading(id=None, text="")
 
-    for element_num, element in enumerate(content):
-
-        def extract_chip(non_text_index: int) -> str | None:
-            return smart_chips.get(f"{tab_num}_{element_num-1}_{non_text_index}")
-
+    for element in content:
         if "paragraph" in element:
             paragraph = element["paragraph"]
 
             # If this is not a heading, add content to current section
             if not _is_heading(paragraph):
-                text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
+                text = _extract_text_from_paragraph(paragraph)
                 if text.strip():
                     current_section.append(text)
                 continue
@@ -244,13 +172,13 @@ def extract_chip(non_text_index: int) -> str | None:
 
             # Start new heading
             heading_id = _extract_id_from_heading(paragraph)
-            heading_text, _ = _extract_text_from_paragraph(paragraph, extract_chip)
+            heading_text = _extract_text_from_paragraph(paragraph)
             current_heading = CurrentHeading(
                 id=heading_id,
                 text=heading_text,
             )
         elif "table" in element:
-            text = _extract_text_from_table(element["table"], extract_chip)
+            text = _extract_text_from_table(element["table"])
             if text.strip():
                 current_section.append(text)
 
diff --git a/backend/onyx/connectors/google_utils/google_utils.py b/backend/onyx/connectors/google_utils/google_utils.py
index 51ec2181a3..4ad6cbe7af 100644
--- a/backend/onyx/connectors/google_utils/google_utils.py
+++ b/backend/onyx/connectors/google_utils/google_utils.py
@@ -1,4 +1,3 @@
-import json
 import re
 import time
 from collections.abc import Callable
@@ -142,50 +141,3 @@ def execute_paginated_retrieval(
                 yield item
         else:
             yield results
-
-
-# https://developers.google.com/apps-script/api/reference/rest/v1/File#FileType
-class AppsScriptFileType(str, Enum):
-    UNSPECIFIED = "ENUM_TYPE_UNSPECIFIED"
-    SERVER_JS = "SERVER_JS"
-    HTML = "HTML"
-    JSON = "JSON"
-
-
-SMART_CHIP_RETRIEVAL_FUNCTIONS = [
-    ("docToChips", ["document_id"]),
-    ("getKey", ["tabInd", "paragraphInd", "nonTextInd"]),
-    ("parseParagraph", ["paragraph", "callback"]),
-    ("parseTable", ["table", "callback"]),
-]
-
-SMART_CHIP_SCRIPT_FILE_NAME = "Smart_Chip_Extractor"
-
-
-# https://developers.google.com/apps-script/api/reference/rest/v1/projects/updateContent
-def create_scripts_file_objects() -> list[GoogleDriveFileType]:
-    with open("onyx/connectors/google_drive/smart_chip_retrieval.gs", "r") as f:
-        script_source = f.read()
-    with open("onyx/connectors/google_drive/appsscript.json", "r") as f:
-        appsscript_source = json.loads(f.read())
-    return [
-        {
-            "name": "appsscript",
-            "type": AppsScriptFileType.JSON.value,
-            "source": json.dumps(appsscript_source),
-        },
-        {
-            "name": SMART_CHIP_SCRIPT_FILE_NAME,
-            "type": AppsScriptFileType.SERVER_JS.value,
-            "source": script_source,
-            "functionSet": {
-                "values": [
-                    {
-                        "name": name,
-                        "parameters": params,
-                    }
-                    for name, params in SMART_CHIP_RETRIEVAL_FUNCTIONS
-                ],
-            },
-        },
-    ]
diff --git a/backend/onyx/connectors/google_utils/resources.py b/backend/onyx/connectors/google_utils/resources.py
index a12f30cf67..48bd981c2b 100644
--- a/backend/onyx/connectors/google_utils/resources.py
+++ b/backend/onyx/connectors/google_utils/resources.py
@@ -12,10 +12,6 @@ class GoogleDocsService(Resource):
     pass
 
 
-class GoogleScriptsService(Resource):
-    pass
-
-
 class AdminService(Resource):
     pass
 
@@ -66,10 +62,3 @@ def get_gmail_service(
     user_email: str | None = None,
 ) -> GmailService:
     return _get_google_service("gmail", "v1", creds, user_email)
-
-
-def get_google_scripts_service(
-    creds: ServiceAccountCredentials | OAuthCredentials,
-    user_email: str | None = None,
-) -> GoogleScriptsService:
-    return _get_google_service("script", "v1", creds, user_email)
diff --git a/backend/onyx/connectors/google_utils/shared_constants.py b/backend/onyx/connectors/google_utils/shared_constants.py
index 483ea6dd35..bd5ebb11ec 100644
--- a/backend/onyx/connectors/google_utils/shared_constants.py
+++ b/backend/onyx/connectors/google_utils/shared_constants.py
@@ -1,6 +1,5 @@
 from enum import Enum as PyEnum
 
-from onyx.configs.app_configs import USE_SMART_CHIP_SCOPES
 from onyx.configs.constants import DocumentSource
 
 # NOTE: do not need https://www.googleapis.com/auth/documents.readonly
@@ -19,19 +18,6 @@
     ],
 }
 
-# TODO: add this to the docs
-GOOGLE_SMART_CHIP_SCOPES = [
-    "https://www.googleapis.com/auth/script.external_request",
-    "https://www.googleapis.com/auth/drive.scripts",
-    "https://www.googleapis.com/auth/script.scriptapp",
-    "https://www.googleapis.com/auth/script.deployments",
-    "https://www.googleapis.com/auth/script.projects",
-    "https://www.googleapis.com/auth/documents",
-]
-
-if USE_SMART_CHIP_SCOPES:
-    GOOGLE_SCOPES[DocumentSource.GOOGLE_DRIVE] += GOOGLE_SMART_CHIP_SCOPES
-
 # This is the Oauth token
 DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens"
 # This is the service account key
diff --git a/backend/onyx/connectors/models.py b/backend/onyx/connectors/models.py
index 4dfadb1d9c..4fe586897b 100644
--- a/backend/onyx/connectors/models.py
+++ b/backend/onyx/connectors/models.py
@@ -39,7 +39,6 @@ class TextSection(Section):
     """Section containing text content"""
 
     text: str
-    link: str | None = None
 
     def __sizeof__(self) -> int:
         return sys.getsizeof(self.text) + sys.getsizeof(self.link)
@@ -49,7 +48,6 @@ class ImageSection(Section):
     """Section containing an image reference"""
 
     image_file_name: str
-    link: str | None = None
 
     def __sizeof__(self) -> int:
         return sys.getsizeof(self.image_file_name) + sys.getsizeof(self.link)

From a226361d8b4bf82779d2ea686709d95588b3a086 Mon Sep 17 00:00:00 2001
From: Evan Lohn <evan@danswer.ai>
Date: Fri, 4 Apr 2025 17:52:52 -0700
Subject: [PATCH 5/7] remove files from previous branch

---
 .../connectors/google_drive/appsscript.json   |  17 ---
 .../google_drive/smart_chip_retrieval.gs      | 132 ------------------
 2 files changed, 149 deletions(-)
 delete mode 100644 backend/onyx/connectors/google_drive/appsscript.json
 delete mode 100644 backend/onyx/connectors/google_drive/smart_chip_retrieval.gs

diff --git a/backend/onyx/connectors/google_drive/appsscript.json b/backend/onyx/connectors/google_drive/appsscript.json
deleted file mode 100644
index 1a9e54269b..0000000000
--- a/backend/onyx/connectors/google_drive/appsscript.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-    "timeZone": "America/Los_Angeles",
-    "dependencies": {
-      "enabledAdvancedServices": [
-        {
-          "userSymbol": "Docs",
-          "version": "v1",
-          "serviceId": "docs"
-        }
-      ]
-    },
-    "exceptionLogging": "STACKDRIVER",
-    "runtimeVersion": "V8",
-    "executionApi": {
-      "access": "MYSELF"
-    }
-  }
\ No newline at end of file
diff --git a/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs b/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs
deleted file mode 100644
index 0419adc179..0000000000
--- a/backend/onyx/connectors/google_drive/smart_chip_retrieval.gs
+++ /dev/null
@@ -1,132 +0,0 @@
-
-/**
- * Retrieves the given Google doc by id and extracts dates, people, and rich links
- * from it into a JSON keyed by tab, paragraph, and non-text-component index.
- * 
- */
-function docToChips(document_id) {
-  const doc = DocumentApp.openById(document_id);
-  const tabs = doc.getTabs();
-  const ret = new Map();
-  tabs.map((tab, tabInd) => {
-    const docTab = tab.asDocumentTab();
-    const body = docTab.getBody();
-    for (let tabChildInd = 0; tabChildInd < body.getNumChildren(); tabChildInd++) {
-      var tabChild = body.getChild(tabChildInd);
-      var callback = ((nonTextInd, replaceText) => {ret[getKey(tabInd, tabChildInd, nonTextInd)] = replaceText;});
-      switch (tabChild.getType()) {
-        case DocumentApp.ElementType.PARAGRAPH:
-          parseParagraph(tabChild.asParagraph(), callback);
-          console.log("paragraph", tabChild.asParagraph().getText());
-          break;
-        case DocumentApp.ElementType.TABLE:
-          console.log("table");
-          parseTable(tabChild.asTable(), callback);
-          break;
-        case DocumentApp.ElementType.LIST_ITEM:
-          var listItem = tabChild.asListItem();
-          //console.log("list item:", listItem.getText(), listItem.getNumChildren());
-          //console.log(listItem.getChild(0).asText().getText());
-          parseParagraph(tabChild.asListItem(), callback);
-          break;
-        default:
-          console.log("found unknown tab body child of type: ", tabChild.getType().toString());
-      }
-    }
-  });
-  console.log(ret);
-  return ret;
-}
-
-// uncomment and paste in a file id (and change the main function to "test")
-// to test the docToChips function
-// function test() {
-//   return docToChips("document id goes here");
-// }
-
-function getKey(tabInd, paragraphInd, nonTextInd) {
-  return tabInd + "_" + paragraphInd + "_" + nonTextInd;
-}
-
-// also used for list items
-function parseParagraph(paragraph, callback) {
-  var nonTextInd = 0;
-  for (let i = 0; i < paragraph.getNumChildren(); i++) { //
-    var child = paragraph.getChild(i);
-    switch (child.getType()) {
-      case DocumentApp.ElementType.DATE:
-        console.log(child.asDate().getDisplayText());
-        callback(nonTextInd, child.asDate().getDisplayText());
-        break;
-      case DocumentApp.ElementType.EQUATION:
-        var eqStr = child.getText();
-        console.log("equation: ", eqStr);
-        callback(nonTextInd, eqStr);
-        break;
-      case DocumentApp.ElementType.PERSON:
-        var personStr = "<name: " + child.asPerson().getName() + ", email: "+ child.asPerson().getEmail() + ">";
-        console.log(personStr);
-        //callback(nonTextInd, personStr);
-        nonTextInd--; // Advanced Docs API picks up people
-        break;
-      case DocumentApp.ElementType.RICH_LINK:
-        var richLink = child.asRichLink()
-        var linkStr = "<title: " + richLink.getTitle() + ", type:" + richLink.getMimeType() + ">"
-        console.log(linkStr);
-        // callback(nonTextInd, child.asRichLink().getUrl());
-        nonTextInd--; // Advanced Docs API picks up rich links
-        break;
-      case DocumentApp.ElementType.TEXT:
-        console.log("text: "+ child.asText().getText());
-        //console.log(child.asText().)
-        nonTextInd--;
-        break;
-      case DocumentApp.ElementType.UNSUPPORTED:
-        console.log("unsupported element type");
-        break;
-      default:
-        console.log("found special element type:", child.getType().toString());
-    }
-    nonTextInd++;
-  }
-}
-
-function parseTable(table, callback) {
-  var lastSeenInCell = 0;
-  var allSeenElems = 0
-  const tableCallback = ((nonTextInd, replaceText) => {
-    callback(allSeenElems + lastSeenInCell + nonTextInd, replaceText);
-    lastSeenInCell++;
-  });
-  for (let rowInd = 0; rowInd < table.getNumChildren(); rowInd++) {
-    var row = table.getChild(rowInd);
-    if (row.getType() !== DocumentApp.ElementType.TABLE_ROW) {
-      console.log("table child type: ", row.getType().toString());
-      continue;
-    }
-
-    for (let colInd = 0; colInd < row.getNumChildren(); colInd++) {
-      var cell = row.getChild(colInd);
-      if (cell.getType() !== DocumentApp.ElementType.TABLE_CELL) {
-        console.log("row child type: ", cell.getType().toString());
-        continue;
-      }
-
-      for (let itemInd = 0; itemInd < cell.getNumChildren(); itemInd++) {
-        var item = cell.getChild(itemInd);
-        console.log(item.getType().toString());
-        switch (item.getType()) {
-          case DocumentApp.ElementType.PARAGRAPH:
-          case DocumentApp.ElementType.LIST_ITEM:
-            parseParagraph(item, tableCallback);
-            break;
-          case DocumentApp.ElementType.TABLE:
-            parseTable(item, tableCallback);
-            break;
-        }
-      }
-      allSeenElems += lastSeenInCell;
-      lastSeenInCell = 0;
-    }
-  }
-}
\ No newline at end of file

From 47a1f8685e2885560fa26d5a844cded1c6e7ead2 Mon Sep 17 00:00:00 2001
From: Evan Lohn <evan@danswer.ai>
Date: Fri, 4 Apr 2025 17:55:12 -0700
Subject: [PATCH 6/7] fix connector tests

---
 .../daily/connectors/google_drive/consts_and_utils.py  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/backend/tests/daily/connectors/google_drive/consts_and_utils.py b/backend/tests/daily/connectors/google_drive/consts_and_utils.py
index f08a587870..678d958da0 100644
--- a/backend/tests/daily/connectors/google_drive/consts_and_utils.py
+++ b/backend/tests/daily/connectors/google_drive/consts_and_utils.py
@@ -123,15 +123,15 @@
 
 SPECIAL_FILE_ID_TO_CONTENT_MAP: dict[int, str] = {
     61: (
-        "Title\n\n"
+        "Title\n"
         "This is a Google Doc with sections - "
-        "Section 1\n\n"
+        "Section 1\n"
         "Section 1 content - "
-        "Sub-Section 1-1\n\n"
+        "Sub-Section 1-1\n"
         "Sub-Section 1-1 content - "
-        "Sub-Section 1-2\n\n"
+        "Sub-Section 1-2\n"
         "Sub-Section 1-2 content - "
-        "Section 2\n\n"
+        "Section 2\n"
         "Section 2 content"
     ),
 }

From 2356674d0990088b63e49d968d7ff3250fa98577 Mon Sep 17 00:00:00 2001
From: Evan Lohn <evan@danswer.ai>
Date: Fri, 4 Apr 2025 20:25:05 -0700
Subject: [PATCH 7/7] fix test

---
 .../daily/connectors/google_drive/test_sections.py     | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/backend/tests/daily/connectors/google_drive/test_sections.py b/backend/tests/daily/connectors/google_drive/test_sections.py
index 6dd22a0e52..25237d701f 100644
--- a/backend/tests/daily/connectors/google_drive/test_sections.py
+++ b/backend/tests/daily/connectors/google_drive/test_sections.py
@@ -46,28 +46,28 @@ def test_google_drive_sections(
         assert len(doc.sections) == 5
 
         header_section = doc.sections[0]
-        assert header_section.text == "Title\n\nThis is a Google Doc with sections"
+        assert header_section.text == "Title\nThis is a Google Doc with sections"
         assert header_section.link is not None
         assert header_section.link.endswith(
             "?tab=t.0#heading=h.hfjc17k6qwzt"
         ) or header_section.link.endswith("?tab=t.0#heading=h.hfjc17k6qwzt")
 
         section_1 = doc.sections[1]
-        assert section_1.text == "Section 1\n\nSection 1 content"
+        assert section_1.text == "Section 1\nSection 1 content"
         assert section_1.link is not None
         assert section_1.link.endswith("?tab=t.0#heading=h.8slfx752a3g5")
 
         section_2 = doc.sections[2]
-        assert section_2.text == "Sub-Section 1-1\n\nSub-Section 1-1 content"
+        assert section_2.text == "Sub-Section 1-1\nSub-Section 1-1 content"
         assert section_2.link is not None
         assert section_2.link.endswith("?tab=t.0#heading=h.4kj3ayade1bp")
 
         section_3 = doc.sections[3]
-        assert section_3.text == "Sub-Section 1-2\n\nSub-Section 1-2 content"
+        assert section_3.text == "Sub-Section 1-2\nSub-Section 1-2 content"
         assert section_3.link is not None
         assert section_3.link.endswith("?tab=t.0#heading=h.pm6wrpzgk69l")
 
         section_4 = doc.sections[4]
-        assert section_4.text == "Section 2\n\nSection 2 content"
+        assert section_4.text == "Section 2\nSection 2 content"
         assert section_4.link is not None
         assert section_4.link.endswith("?tab=t.0#heading=h.2m0s9youe2k9")